Example #1
def parse_dates(df, format):
    """
    Parses a date string column into year, month, and day components.
    :param df: input DataFrame with a 'transaction_date' column
    :param format: the format of the timestamp
    :return: DataFrame with parsed_date, year, month, day and unix_ts columns
    """
    return df.withColumn('parsed_date',
                         f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
Example #2
def process_log_data(spark, input_data, output_data):
    """
    This function processes all event logs of the Sparkify app.
    :param spark: active SparkSession
    :param input_data: root path of the input data
    :param output_data: root path for the output parquet files
    :return: None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table, keeping the most recent record per user
    # (deduplicate on the raw df first, since the aliased table no longer has ts/userId)
    users_table = (df.orderBy("ts", ascending=False)
                   .dropDuplicates(subset=["userId"])
                   .select(
                       col('userId').alias('user_id'),
                       col('firstName').alias('first_name'),
                       col('lastName').alias('last_name'),
                       col('gender'),
                       col('level')))

    # write users table to parquet files
    #users_table.write.parquet(output_data + "users.parquet", mode="overwrite")
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    #df =
    df = df.withColumn(
        "ts_timestamp",
        F.to_timestamp(
            F.from_unixtime((col("ts") / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    def get_weekday(date):
        """
        This function gets weekday from date
        :param date:
        :return weekday:
        """
        import datetime
        import calendar
        date = date.strftime("%m-%d-%Y")  # , %H:%M:%S
        month, day, year = (int(x) for x in date.split('-'))
        weekday = datetime.date(year, month, day)
        return calendar.day_name[weekday.weekday()]

    udf_week_day = udf(get_weekday, T.StringType())

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour(col("ts_timestamp"))).withColumn(
        "day", dayofmonth(col("ts_timestamp"))).withColumn(
            "week", weekofyear(col("ts_timestamp"))).withColumn(
                "month", month(col("ts_timestamp"))).withColumn(
                    "year", year(col("ts_timestamp"))).withColumn(
                        "weekday", udf_week_day(col("ts_timestamp"))).select(
                            col("ts_timestamp").alias("start_time"),
                            col("hour"), col("day"), col("week"), col("month"),
                            col("year"), col("weekday")).distinct())

    time_table = time_table.drop_duplicates(subset=['start_time'])
    # write time table to parquet files partitioned by year and month
    #time_table.write.parquet(output_data + "time.parquet", mode="overwrite")
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (df.withColumn(
        "songplay_id", F.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                "songplay_id",
                col("ts_timestamp").alias("start_time"),
                col("userId").alias("user_id"), "level", "song_id",
                "artist_id",
                col("sessionId").alias("session_id"), "location",
                col("userAgent").alias("user_agent")).distinct())

    # write songplays table to parquet files partitioned by year and month
    #songplays_table.write.parquet(output_data + "songplays.parquet", mode="overwrite")
    songplays_table.write.parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
Example #3
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window, Row

# File paths
source_user_path = "s3://polakowo-yelp2/yelp_dataset/user.json"
target_users_path = "s3://polakowo-yelp2/staging_data/users"

user_df = spark.read.json(source_user_path)

# Drop fields which will be outsourced and cast timestamp field
users_df = user_df.drop("elite", "friends")\
    .withColumn("yelping_since", F.to_timestamp("yelping_since"))

users_df.write.parquet(target_users_path, mode="overwrite")
Example #4
    StructField("tip_amount", DoubleType(), True), \
    StructField("tolls_amount", DoubleType(), True), \
    StructField("improvement_surcharge", DoubleType(), True), \
    StructField("total_amount", DoubleType(), True), \
    StructField("congestion_surcharge", DoubleType(), True)
                     ])
df = sc.read.format("csv").options(header='True').schema(schema).load(
    "../../dan606/nyctaxi/trip data/yellow_tripdata_2019-08.csv")
# WARNING "WARN  ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException" CAN BE IGNORED

df.printSchema()

# handle dates AND time
df = df.withColumn(
    'pickup_time',
    fun.to_timestamp('tpep_pickup_datetime', "yyyy-MM-dd HH:mm:ss"))
df = df.withColumn('pickup_hour', fun.hour("pickup_time"))

## ML: classification with Decision Trees

# Predicting the 'RateCodeID' value (trip rate category) from other features of the Taxi data
# https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
pred_col = ["trip_distance", "pickup_hour", "passenger_count"]
resp_var = 'RateCodeID'  # trip categories
dffeat = df.na.drop()
vector_assembler = VectorAssembler(
    inputCols=pred_col,
    outputCol='features')  #Create pipeline and pass it to stages
pipeline = Pipeline(stages=[vector_assembler])
df_transformed = pipeline.fit(dffeat).transform(dffeat)
df_input = df_transformed.select(resp_var, 'features').withColumnRenamed(
print("Streaming DF: " + str(streamingDF.isStreaming))
print("Static DF: " + str(staticDF.isStreaming))

# COMMAND ----------

#2. Create a processing statement ("action")
#real-time time difference between arrival_time and creation_time aggregated by user and device.
# change format of "Arrival_Time", "Creation_Time" to timestamp
from pyspark.sql.functions import col, to_timestamp, datediff
streamingDF = streamingDF\
         .withColumn("Arrival_Time", col("Arrival_Time").cast("long"))\
         .withColumn("Creation_Time", col("Creation_Time").cast("long"))

streamingDF = streamingDF\
         .withColumn("Arrival_Time2",to_timestamp(streamingDF['Arrival_Time']))\
         .withColumn("Creation_Time2",to_timestamp(streamingDF['Creation_Time']))\
         .withColumn("time_diff", datediff(col("Arrival_Time2"), col("Creation_Time2")) )

timeDiff = streamingDF.groupBy('User', 'Device').sum('time_diff')

# COMMAND ----------

timeDiff.show()

# COMMAND ----------

#Set shuffle partitions to a small value to avoid creating too many shuffle partitions
spark.conf.set("spark.sql.shuffle.partitions", 5)

# COMMAND ----------
spark = SparkSession.builder.config(
    "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0").getOrCreate()

df = spark.read.csv("s3a://udacity-dend/pagila/payment/payment.csv")
print(df.printSchema())
print(df.show(5))

df = spark.read.csv("s3a://udacity-dend/pagila/payment/payment.csv",
                    sep=';',
                    inferSchema=True,
                    header=True)
print(df.printSchema())
print(df.show(5))

df_payment = df.withColumn("payment_date", F.to_timestamp('payment_date'))
print(df_payment.printSchema())
print(df_payment.show(5))

df_payment = df_payment.withColumn("month", F.month('payment_date'))
print(df_payment.show(5))

df_payment.createOrReplaceTempView("payment")
print(
    spark.sql(
        "select month, sum(amount) as revenue from payment group by month order by revenue desc"
    ).show())

#fix schema
paymentSchema = StructType([
    StructField("payment_id", IntegerType()),
    'content_size', F.expr('cast(content_size as int)'))
dfClean.limit(5).toPandas()

#UDF DATETIME


@F.udf(StringType())
def dateTimeUDF(row):

    pattern_date = r'[0-9]{2}/[A-Z][a-z]{2}/[0-9]{4}'
    pattern_time = r'\:[0-9]{2}\:[0-9]{2}\:[0-9]{2}'

    match_date = re.search(pattern_date, row).group(0)
    match_time = re.search(pattern_time, row).group(0)

    date = match_date.replace('Jul', '07')
    time = match_time[1:]

    return date + ' ' + time


#FORMAT DATE

dfClean_Date = dfClean.withColumn('date_time', dateTimeUDF('date_time'))
dfClean_Date.limit(5).show()

dfClean_Date = dfClean_Date.withColumn(
    'date_time', F.to_timestamp('date_time', 'dd/MM/yyyy HH:mm:ss'))
dfClean_Date.limit(5).show()
dfClean_Date.printSchema()
Example #8
def run_spark_job(spark):
    configure_logging(spark)
    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    # Ref: https://spark.apache.org/docs/2.2.0/structured-streaming-kafka-integration.html
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "udacity.sf.police.crime.v2") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    # kafka_df.writeStream.format("console").outputMode("append").start()

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    # service_table.writeStream.format("console").outputMode("append").start()

    # I have seen there are some rows with null values in both original_crime_type
    # and disposition, so I will filter them out.

    service_table_non_nulls = service_table.na.drop(
        subset=["original_crime_type_name", "disposition"])

    # TODO select original_crime_type_name and disposition
    # I'm using psf.to_timestamp to convert the string timestamp into a timestamp object so we can use it later
    # to do watermarking and windowed aggregations.

    distinct_table = service_table_non_nulls.select(
        "original_crime_type_name", "disposition",
        psf.to_timestamp("call_date_time").alias(
            "call_date_time_ts")).distinct()
    # distinct_table.writeStream.format("console").outputMode("append").start()

    # count the number of original crime type
    # Nice blog about watermarking
    # https://databricks.com/blog/2017/05/08/event-time-aggregation-watermarking-apache-sparks-structured-streaming.html
    # Discarding events that arrive more than 10 minutes late. I don't want to set a huge watermark to avoid having
    # memory issues
    agg_df = distinct_table \
        .select("original_crime_type_name", "disposition", "call_date_time_ts") \
        .withWatermark("call_date_time_ts", "10 minutes") \
        .groupBy("original_crime_type_name",
                 psf.window("call_date_time_ts", "10 minutes", "5 minutes"),
                 "disposition"  # Including this field so I can run the aggregation later.
                 ) \
        .count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # TODO write output stream

    query = agg_df \
        .writeStream \
        .queryName("Original Crime Type Count Aggregation")\
        .trigger(processingTime="30 seconds") \
        .format('console') \
        .option("truncate", "false") \
        .start()

    # TODO attach a ProgressReporter
    #query.awaitTermination()

    # TODO get the right radio code json path
    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read. \
        option("multiline", "true"). \
        json(radio_code_json_filepath, radio_code_schema)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # TODO rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    radio_code_df.printSchema()
    # TODO join on disposition column
    # Nice blog on joins: https://luminousmen.com/post/introduction-to-pyspark-join-types
    # In this case, an inner join would only return rows whose disposition exists in the radio
    # code dataframe, so I will be using a left join because I want to keep the aggregated data
    # even if there is no matching disposition
    join_query = agg_df.join(radio_code_df, on='disposition', how='left')

    query_join = join_query \
        .writeStream \
        .queryName("Join with radio codes")\
        .trigger(processingTime="30 seconds") \
        .format('console') \
        .option("truncate", "false") \
        .start()
    query_join.awaitTermination()
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.
    
    Args:
        spark: the spark session
        input_data: root path (S3 or local) of the input data
        output_data: root path (S3 or local) for the output parquet files
    
    Returns:
        modeled data from logs and songs json files that are written to parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # create date column from the parsed timestamp column
    df = df.withColumn('ts_datetime', f.to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
          .withColumn("day", dayofmonth(col("timestamp"))) \
          .withColumn("week", weekofyear(col("timestamp"))) \
          .withColumn("month", month(col("timestamp"))) \
          .withColumn("year", year(col("timestamp"))) \
          .withColumn("weekday", datetime.datetime(col("timestamp")).weekday()) \
          .select(
            col("timestamp").alias("start_time"),
            col("hour"),
            col("day"),
            col("week"),
            col("month"),
            col("year"),
            col("weekday")
          )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', F.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'), 'level', 'song_id',
                'artist_id',
                col('sessionId').alias('session_id'), 'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays.parquet",
                                  mode="overwrite")
Example #10
def run_spark_application():
    # Creates session and spark context

    sc = SparkContext(appName="Stocks")
    spark = SQLContext.getOrCreate(sc)

    amazonDataFrame = createDataFrame(spark, "amazon.csv")
    amazonInfo = selectInfoFromDataFrame(amazonDataFrame, "amazon")

    googDataFrame = createDataFrame(spark, "google.csv")
    googInfo = selectInfoFromDataFrame(googDataFrame, "google")

    facebookDataFrame = createDataFrame(spark, "facebook.csv")
    facebookInfo = selectInfoFromDataFrame(facebookDataFrame, "facebook")

    # Collect all Date and closing into one dataFrame
    dataTable = amazonInfo.join(
        googInfo, amazonInfo.amazonDate == googInfo.googleDate).select(
            "amazonDate", "closeAmazon", "closeGoogle")
    dataTable = dataTable.join(
        facebookInfo,
        dataTable.amazonDate == facebookInfo.facebookDate).select(
            dataTable["amazonDate"].alias("date"), "closeAmazon",
            "closeGoogle", "closeFacebook")

    # We want to format the data into the format such that first column is all date, second column is symbols and last
    # column is all about the closing price of that day
    amazFormatted = selectInfoAsNewNames(dataTable, "amazon")
    faceBookFormatted = selectInfoAsNewNames(dataTable, "facebook")
    googFormatted = selectInfoAsNewNames(dataTable, "google")
    # We union the columns together, then reorder them by dates
    formattedDataTable = amazFormatted.union(faceBookFormatted).union(
        googFormatted)
    formattedDataTable = formattedDataTable.orderBy(
        formattedDataTable.date.asc())

    # We construct the final DataFrame
    # 1: We add timestamp and price as two new columns based on date and closing Price
    finalDf = formattedDataTable.withColumn(
        "timestamp", to_timestamp(formattedDataTable.date)).withColumn(
            "price", formattedDataTable["closingPrice"].cast("double"))
    # 2: After that we drop the original price and closingPrice
    finalDf = finalDf.drop("date", "closingPrice").sort("timestamp")
    finalDf.registerTempTable("preData")
    finalDf.show()

    # We gather the necessary data to create a time series RDD
    minDate = finalDf.selectExpr(
        "min(timestamp)").collect()[0]["min(timestamp)"]
    maxDate = finalDf.selectExpr("max(timestamp)").alias(
        "timestamp").collect()[0]["max(timestamp)"]
    frequency = DayFrequency(1, sc)

    dtIndex = datetimeindex.DateTimeIndex.uniform(start=minDate,
                                                  end=maxDate,
                                                  freq=frequency,
                                                  sc=sc)
    tsRdd = timeseriesrdd.time_series_rdd_from_observations(
        dtIndex, finalDf, "timestamp", "symbol", "price")

    # Last step: perform the prediction
    df = tsRdd.map_series(train_transform_func)

    # Skip the time zone check in Python here; it is not worth handling for this example
    finalDf.show()
    spark.stop()
Example #11
def process_log_data(spark, input_data, output_data):
    """
    Process log_data from input_data path and save users, time and songplays tables in parquet format in output_data path
    
    Parameters:
        spark: SparkSession object to process data
        input_data: path to input data
        output_data: path to output data
    """
    
    # get filepath to log data file
    log_data = input_data + 'log_data/*'

    # read log data file
    log_df = spark.read.json(log_data)
    
    # filter by actions for song plays 
    log_df = log_df.filter('page = "NextSong"') \
                   .withColumn('user_id', log_df['userId'].cast('integer')) \
                   .withColumn('session_id', log_df['sessionId'].cast('integer')) \
                   .withColumnRenamed('firstName', 'first_name') \
                   .withColumnRenamed('lastName', 'last_name')

    # extract columns for users table    
    users_table = log_df[['user_id', 'first_name', 'last_name', 'gender', 'level']]
    
    # write users table to parquet files
    users_table.where(users_table.user_id.isNotNull()).distinct().write.mode('overwrite').parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    time_df = log_df[['ts']]
    
    # create datetime column from original timestamp column
    time_df = time_df.withColumn('ts', to_timestamp(col('ts')/1000))
    
    # extract columns to create time table
    time_table = time_df.withColumnRenamed('ts', 'start_time') \
                        .withColumn('hour', hour(col('start_time'))) \
                        .withColumn('day', dayofmonth(col('start_time'))) \
                        .withColumn('week', weekofyear(col('start_time'))) \
                        .withColumn('month', month(col('start_time'))) \
                        .withColumn('year', year(col('start_time'))) \
                        .withColumn('weekday', date_format(col('start_time'), 'u').cast('integer'))
     
    # write time table to parquet files partitioned by year and month
    time_table.distinct().write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*')

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = log_df.join(song_df, [log_df.song == song_df.title, log_df.artist == song_df.artist_name]) \
                             .selectExpr('monotonically_increasing_id() as songplay_id', \
                                         'to_timestamp(ts/1000) as start_time', \
                                         'month(to_timestamp(ts/1000)) as month', \
                                         'year(to_timestamp(ts/1000)) as year', \
                                         'user_id as user_id', \
                                         'level as level', \
                                         'song_id as song_id', \
                                         'artist_id as artist_id', \
                                         'session_id as session_id', \
                                         'location as location', \
                                         'userAgent as user_agent') 

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data+'songplays/')
Example #12
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.enableHiveSupport()\
.config("spark.sql.parquet.writeLegacyFormat",True)\
.getOrCreate()

df = spark.read.csv(
    "hdfs://hive-namenode:8020/user/sqoop/order_detail/part-m-00000",
    header=False)
rename = {
    '_c0': 'order_created_timestamp',
    '_c1': 'status',
    '_c2': 'price',
    '_c3': 'discount',
    '_c4': 'id',
    '_c5': 'driver_id',
    '_c6': 'user_id',
    '_c7': 'restaurant_id',
}
df = df.toDF(*[rename[c] for c in df.columns])
df = df.withColumn('order_created_timestamp',
                   F.to_timestamp('order_created_timestamp'))
df = df.withColumn('dt', F.date_format('order_created_timestamp', "yyyyMMdd"))
df = df.withColumn('price', F.col('price').cast(T.IntegerType()))
df = df.withColumn('discount', F.col('discount').cast(T.FloatType()))
df.write.parquet(
    'hdfs://hive-namenode:8020/user/spark/transformed_order_detail',
    partitionBy='dt',
    mode='overwrite')
Example #13
def summary_df(df,fn): #,max_date):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])
    # Remove non search sessions
    df = df[df['ad_click']>0]

    # sum ad_click
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\
                                        .agg(F.sum("ad_click").alias("ad_click"))
        
    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\
                               .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\
                               .withColumn("country_code", F.upper(F.col("country_code")))

    # add country field and revenue table - need transform to calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(rev_by_country, sum_search_clients_daily.country == rev_by_country.country_code,how='left_outer')
    spec = Window.partitionBy("client_id","country","submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    # NOTE partition includes country because client may change country over time

    no_country=(
    tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(.005))
    )

    has_country=(
    tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn('rank', F.row_number().over(spec))
        .where("rank = 1")
        .drop('rank')
    )

    tbl2=(no_country.union(has_country))

    # drop first purchase to calculate revenue
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc()) # earliest date has row #1
    search_rev = (tbl2
     .withColumn("rank", F.row_number().over(spec2))
     .where("rank > 1")
    ).groupBy("client_id").agg(F.avg(F.col('rev_per_search_float')*F.col('ad_click')).alias("monetary_value"))
    
    # compute the final dataset for the BG/NBD model
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(F.datediff(F.max('activity_date'),F.min("activity_date")).alias("recency"), 
             (F.countDistinct('activity_date')-1).alias("frequency"),
             (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),F.min("activity_date"))).alias("T"),
             F.sum("ad_click").alias("historical_searches"),
             F.sum(F.col('rev_per_search_float')*F.col('ad_click')).alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id", (F.crc32("client_id") % 100).alias("sample_id"), "frequency","recency","T","monetary_value","historical_searches","historical_clv")
      ).fillna(0, subset=['monetary_value'])

    # anonymize client_id    
    dataset = dataset.withColumn('client_id',sha1(dataset.client_id))

    # write dataset recency, freq, age, revenue table per client
    #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
Example #14
def drop_minutes(df, colName):
    fmt = "yyyy-MM-dd HH:00:00"
    return df.withColumn(
        colName,
        to_timestamp(date_format(col(colName), fmt), "yyyy-MM-dd HH:mm:ss"))
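# Minimal usage sketch for drop_minutes above (added for illustration, not part of the
# original snippet): assumes a local SparkSession and a DataFrame with an 'event_time'
# timestamp column, which gets truncated to the top of the hour.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, to_timestamp  # used by drop_minutes

spark = SparkSession.builder.master("local[1]").appName("drop_minutes_demo").getOrCreate()
events = spark.createDataFrame([("2021-03-15 10:37:42",)], ["event_time"]) \
    .withColumn("event_time", to_timestamp("event_time"))
drop_minutes(events, "event_time").show(truncate=False)
# event_time is now 2021-03-15 10:00:00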
Example #15
def main():
    """
    Lavora sui dati del database originale per fare ETL e caricare sulla
    dwh i dati che ci servono
    """

    spark = SparkSession \
        .builder \
        .appName('DBAnalysis') \
        .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar') \
        .getOrCreate()

    properties = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres:5432/postgres',
        'user': '******',
        'password': '******',
        'dbtable': ' spotify_details',
    }

    properties_dwh = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres_dwh:5432/postgres',
        'user': '******',
        'password': '******'
    }

    df2 = spark.read \
        .format('jdbc') \
        .option('driver', properties['driver']) \
        .option('url', properties['url']) \
        .option('user', properties['user']) \
        .option('password', properties['password']) \
        .option('dbtable', properties['dbtable']) \
        .load()

    count_tracks_distribution = df2.groupby('id_playlist')\
                                .count()

    # Create the year_month column
    df3 = df2.withColumn(
        'year_month',
        date_format(to_timestamp(df2.timestamp, "yyyy-MM-dd'T'HH:mm:ssXXX"),
                    "yyyy-MM").alias('year_month'))

    # Aggregate by playlist and year_month and count the occurrences
    df4 = df3.groupby('id_playlist', 'year_month').count()

    # To assign a month to a playlist, we decided to pick the month with the most song "additions"
    # For each playlist, keep only the row with the highest per-month count
    w = Window.partitionBy('id_playlist')

    df5 = df4.withColumn('max', f.max('count').over(w))\
        .where(f.col('count') == f.col('max'))\
        .drop('max', 'count')

    month_distribution = df5\
        .where(f.col('year_month')>="2020-01")\
        .groupby('year_month')\
        .count()

    # The complete df has the reference-month information assigned to each playlist
    spotify_complete = df2.join(df5, on=['id_playlist'], how='left')

    df_complete2 =  spotify_complete.groupBy("id_playlist", 'name_playlist' , 'year_month')\
                    .agg(f.mean('danceability'),f.stddev_pop('danceability'),f.mean('energy'),f.stddev_pop('energy'),f.mean('valence'),f.stddev_pop('valence'))\
                    .sort('year_month', ascending=True)

    newColumns = [
        "id_playlist", "name_playlist", "year_month", "avgdanceability",
        "stdddanceability", "avgenergy", "stddenergy", "avgvalence",
        "stddvalence"
    ]
    df_complete2 = df_complete2.toDF(*newColumns)

    df_complete3 = df_complete2.groupBy('year_month')\
    .agg(f.mean('avgdanceability'),f.mean('stdddanceability'),f.mean('avgenergy'),f.mean('stddenergy'),f.mean('avgvalence'),f.mean('stddvalence'))\
    .sort('year_month', ascending=True)

    newColumns = [
        "timestamp", "mean_danceability", "stdev_danceability", "mean_energy",
        "stdev_energy", "mean_valence", "stdev_valence"
    ]
    audiofeatures_stat = df_complete3.toDF(*newColumns)

    audiofeatures_stat.write.jdbc(url=properties_dwh['url'],
                                  table='audiofeatures_stat',
                                  mode='overwrite',
                                  properties=properties_dwh)
    spotify_complete.write.jdbc(url=properties_dwh['url'],
                                table='spotify_complete',
                                mode='overwrite',
                                properties=properties_dwh)
    month_distribution.write.jdbc(url=properties_dwh['url'],
                                  table='month_distribution',
                                  mode='overwrite',
                                  properties=properties_dwh)
    count_tracks_distribution.write.jdbc(url=properties_dwh['url'],
                                         table='count_tracks_distribution',
                                         mode='overwrite',
                                         properties=properties_dwh)
Example #16
import operator
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as func
from pyspark.sql.functions import collect_list, to_timestamp, udf
from pyspark.sql.window import Window

conf = SparkConf().setAppName("Ex2").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

df = sc.textFile(
    '/host/HieldshiemMasters/Semester1/DistributedDataAnalytics/Exercises/Ex9_Solution/ml-10M100K/tags.dat'
).map(lambda x: x.split("::"))

df = df.toDF(['UserID', 'MovieID', 'Tag', 'Timestamp'])
df_Update = df.withColumn('time_datestring', func.from_unixtime('timestamp'))
df_Update = df_Update.withColumn(
    'time_date', to_timestamp(df_Update.time_datestring,
                              'yyyy-MM-dd HH:mm:ss'))
#print(df_Update)
#df_Update.show()

#===== get all the time stamps for each user ========================
#test=df_Update.groupBy(['UserID'])
new = df_Update.groupBy(['UserID']).agg(collect_list("time_date"))
#test.show()
#==========sort time stamps for each user===========================
#func=udf(lambda x:sorted(x.tolist()))


def sorter(l):
    res = sorted(l)
    return [item for item in res]
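# Hedged sketch (not part of the original snippet): register sorter as a UDF and apply it
# to the collected timestamp lists so each user's timestamps come back sorted. The column
# name 'collect_list(time_date)' is the default name produced by the agg above.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, TimestampType

sort_udf = udf(sorter, ArrayType(TimestampType()))
sorted_times = new.withColumn('sorted_time_date',
                              sort_udf(func.col('collect_list(time_date)')))
sorted_times.show(truncate=False)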
Example #17
def rename_cols(df, mapping):
    for old, new in mapping.items():
        df = df.withColumnRenamed(old, new)
    return df
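# Minimal usage sketch for rename_cols (added for illustration): assumes a SparkSession
# named `spark`; the demo columns and mapping below are made up for the example.
demo = spark.createDataFrame([(1, 'EUW1_123')], ['gameId', 'matchId'])
demo = rename_cols(demo, {'gameId': 'game_id', 'matchId': 'match_id'})
demo.printSchema()  # columns are now game_id, match_id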


# COMMAND ----------

match = spark.read.json('dbfs:/mnt/lol/landing/europe/match/')
team = spark.read.json('dbfs:/mnt/lol/landing/europe/team/')
participant = spark.read.json('dbfs:/mnt/lol/landing/europe/participants/')

match = match\
    .withColumn('gameCreation', F.col('gameCreation') / 1000)\
    .withColumn('gameCreation', F.to_timestamp('gameCreation'))\
    .withColumn('gameStartTimestamp', F.col('gameStartTimestamp') / 1000)\
    .withColumn('gameStartTimestamp', F.to_timestamp('gameStartTimestamp'))

# COMMAND ----------

sum_cols = {
    'kills': 'kills',
    'assists': 'assists',
    'deaths': 'deaths',
    'doubleKills': 'double_kills',
    'tripleKills': 'triple_kills',
    'quadraKills': 'quadra_kills',
    'pentaKills': 'penta_kills',
    'unkilled': 'unkilled',
    'goldEarned': 'gold',
Example #18
    'l_traffic_dl_pktuuloss_tot_qci_8', 'l_e_rab_succest',
    'l_thrp_time_cell_ul_highprecision', 'l_e_rab_initsuccest',
    'l_e_rab_abnormrel_other_voip', 'l_traffic_dl_pktdelay_num'
]

currentTime = datetime.now()
lags = currentTime - timedelta(minutes=120)

# Change this based on the time zone difference (in minutes)
time_difference = 150

Full_DF = spark.read.format("org.apache.spark.sql.cassandra")\
    .options(table='huawei_4g', keyspace='common')\
    .load()\
    .select(*allCols)\
    .withColumn('ossresulttime',func.to_timestamp("ossresulttime", "yyyy-MM-dd HH:mm"))\
    .withColumn('ossdate',func.to_timestamp("ossdate", "yyyy-MM-dd"))\
    .withColumn('osshour',col('osshour').cast('float'))\
    .filter(col("ossresulttime") > datetime.now()+timedelta(minutes=timdedifference))

combined_DF = Full_DF.select(*impCols)
maxTime = combined_DF.select(func.max('ossresulttime')).first()
combined_DF = combined_DF.filter(
    combined_DF['ossresulttime'] >= maxTime[0] - timedelta(minutes=15))

combined_DF = combined_DF.na.fill({
    'l_traffic_ul_pktloss_loss_qci_5':
    9.32716621103459e-06,
    'l_e_rab_attest_qci_2':
    5.48632216489652e-05,
    'l_thrp_time_dl_qci_2':
Example #19
print('# Structure loaded')
print('> Loading preprocessing models')

# Preprocessing

columns = dataset.columns[5:]
dataset = dataset.na.fill("N/A")

# Cast strings to the appropriate types

dataset = dataset.withColumn(
    "timestamp", F.from_unixtime(F.col("timestamp"),
                                 "yyyy-MM-dd'T'HH:mm:ssXXX"))
dataset = dataset.withColumn(
    "time", F.to_timestamp(F.col("Time"), 'MM/dd/yyyy hh:mm:ss a'))
dataset = dataset.withColumn(
    "time", F.date_format(F.col("Time"), "yyyy-MM-dd'T'HH:mm:ssXXX"))

dataset = dataset.withColumn('First', F.col('First').cast(BooleanType()))
dataset = dataset.withColumn('Sequencenum',
                             F.col('Sequencenum').cast(IntegerType()))

# Create the hour column

dataset = dataset.withColumn("hour", F.hour(F.col("time")))

# Normalize the hour column

dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0) * 6)
Example #20
def process_log_data(spark, input_data, output_data):
    """
    - Processes JSON log files stored in input location 
    - Transforms dimension tables: users, time
    - Transforms fact tables: songplays
    - Saves output to parquet files 
    
    Arguments: 
        spark -- instantiated object for spark session
        input_data (str) -- path to folder containing log files to be processed
        output_data (str) -- output path for final parquet files 
    """

    print("Log processing : Started")

    # get filepath to log data file
    log_data = input_data + 'log_data/'

    # read log data file
    df = spark.read.option("recursiveFileLookup", "true").json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == "NextSong")

    # extract columns for users table
    users_table = df.selectExpr("userId as user_id", "firstName as first_name",
                                "lastName as last_name", "gender",
                                "level").distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        "%Y-%m-%d %H:%M:%S"))
    df = df.withColumn("timestamp", to_timestamp(get_timestamp("ts")))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime("%Y-%m-%d"))
    df = df.withColumn("date", to_date(get_datetime("ts")))

    # extract columns to create time table
    df.createOrReplaceTempView("timetable")
    time_table = spark.sql("""
            SELECT DISTINCT 
                    timestamp AS start_time, 
                    HOUR(timestamp) AS hour, 
                    DAY(timestamp) AS day, 
                    WEEKOFYEAR(timestamp) AS week, 
                    MONTH(timestamp) AS month, 
                    YEAR(timestamp) AS year, 
                    DAYOFWEEK(timestamp) AS weekday
                FROM timetable 
        """)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + "time.parquet",
                                                  mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")
    artist_df = spark.read.parquet(output_data + "artists.parquet").selectExpr(
        "artist_id as ref_artist", "name")
    song_df = song_df.join(artist_df,
                           song_df.artist_id == artist_df.ref_artist)

    if song_df.count() > 0:
        # extract columns from joined song and log datasets to create songplays table
        songplays_table = df.join(song_df , (df.artist == song_df.name) & (df.song == song_df.title) , how='left')\
            .selectExpr("concat_ws('_', userId, ts) as songplay_id", "timestamp as start_time", "userId as user_id", \
                        "level", "song_id", "artist_id", "sessionId as session_id", "location", "userAgent as user_agent" )

        # write songplays table to parquet files partitioned by year and month
        songplays_table.withColumn("year", year("start_time")).withColumn("month", month("start_time"))\
            .write.partitionBy("year", "month")\
            .parquet(output_data + "songplays.parquet", mode="overwrite")

    print("Log processing : Ended")
Example #21
                    StructField("machineType", StringType(), True),
                    StructField("deviceId", StringType(), True),
                    StructField("type", StringType(), True),
                    StructField("status", StringType(), True),
                    StructField("timestamp", StringType(), True),
                ]))

dfParsedContents = dfAvroInput.withColumn("Body", from_json(col("Body").cast("string"), contentSchema))
dfExplodedParsedContents = dfParsedContents.withColumn("Body", explode("Body").alias("key"))

display(dfExplodedParsedContents)

# COMMAND ----------

# DBTITLE 1,Convert EnqueuedTime and unwrap body contents
dfFormatedTime = dfExplodedParsedContents.withColumn("EnqueuedTimeUtc", to_timestamp("EnqueuedTimeUtc", 'M/d/yyyy h:mm:ss a'))

dfUnwrappedContents = dfFormatedTime.withColumn("EventTimestamp", to_timestamp(col("Body").getItem("timestamp")))
dfUnwrappedContents = dfUnwrappedContents.withColumn("FactoryArea", col("Body").getItem("factoryArea"))
dfUnwrappedContents = dfUnwrappedContents.withColumn("MachineType", col("Body").getItem("machineType"))
dfUnwrappedContents = dfUnwrappedContents.withColumn("DeviceID", col("Body").getItem("deviceId"))
dfUnwrappedContents = dfUnwrappedContents.withColumn("type", col("Body").getItem("type"))
dfUnwrappedContents = dfUnwrappedContents.withColumn("Status", col("Body").getItem("status"))
dfUnwrappedContents = dfUnwrappedContents.drop("Body")

display(dfUnwrappedContents)

# COMMAND ----------

# DBTITLE 1,Keep only event information
dfAlarmEvents = dfUnwrappedContents.select("EventTimestamp", "FactoryArea", "MachineType", "DeviceID", "type", "Status")
Example #22
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.2-bin-hadoop2.7"

tweet_raw = (spark.read.format("csv").options(header="true").load("20200312_Coronavirus_Tweets_Subset.CSV"))
display(tweet_raw)
tweet_raw.show(10, False)

#### CLEAN DATA ####
# Initially filter tweets in English & create new filtered DataFrame
tweet_filter = tweet_raw.select("*", F.when(tweet_raw.lang == 'en', 'TRUE').alias('eng_true'))
tweet_filter = tweet_filter.filter("eng_true == 'TRUE'")
# Fix Date Structure
tweet_filter = tweet_filter.withColumn('created_at', regexp_replace('created_at', 'T', ' '))
tweet_filter = tweet_filter.withColumn('created_at', regexp_replace('created_at', 'Z', ''))
# Convert to Timestamp
tweet_filter = tweet_filter.withColumn('dt',to_timestamp(tweet_filter.created_at, 'yyyy-MM-dd HH:mm:ss'))
# Drop Unused Columns
tweet_filter = tweet_filter.drop('created_at','reply_to_status_id','reply_to_user_id','reply_to_screen_name','place_type','account_lang')
# Define Columns for Integer Transformation
cols = spark.createDataFrame([('status_id',1),('user_id',2),('favourites_count',3),('retweet_count',4),('followers_count',5),('friends_count',6)])
cols_col = cols.select("_1")

tweet_filter.show(10, False)

tweet_raw.printSchema()

type(tweet_raw)

# create a spark session
spark = SparkSession.builder\
                    .master("local")\
Example #23
# datediff('2016-01-01', '2017-01-01')
# FROM dateTable

from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")
# -- in SQL
# SELECT to_date(date, 'yyyy-dd-MM'), to_date(date2, 'yyyy-dd-MM'), to_date(date)
# FROM dateTable2

# in Python
from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()
# -- in SQL
# SELECT to_timestamp(date, 'yyyy-dd-MM'), to_timestamp(date2, 'yyyy-dd-MM')
# FROM dateTable2

# SELECT cast(to_date("2017-01-01", "yyyy-dd-MM") as timestamp)

# Spark includes a function to allow you to select the first non-null value from a set of columns by using
# the coalesce function.
from pyspark.sql.functions import coalesce
df.select(coalesce(col("Description"), col("CustomerId"))).show()

# deal with null
# -- in SQL
# SELECT
# ifnull(null, 'return_value'),
Example #24
def process_log_data(spark, input_data, output_data):
    
    """
    This function takes the log data from Udacity's S3 input location and processes it. This is done by
    extracting the user, time and songplay tables and then loading them back to the S3 bucket I've created in AWS.

    Parameters:
            spark       : Spark Session
            input_data  : The S3 bucket location of log_data, think 'input'
            output_data : The S3 bucket location of the processed data, think 'output'
    """ 
    
    #Using print statement to understand where in spark statement we are
    print("\n Taking in log data as variable from S3's input location....")
    # get full filepath to song data file
    #log_data = input_data + 'log_data/*/*/*.json'
    # using an exact folder of the data set to speed up execution in the workspace (use the commented-out log_data variable above to run the full ETL with wildcards)
    log_data = input_data + 'log_data/2018/11/*.json'
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Defining log Schema....")
    log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()),
                         SFld("firstName", Str()), SFld("gender", Str()),
                         SFld("itemInSession", Lng()), SFld("lastName", Str()),
                         SFld("length", Dbl()), SFld("level", Str()),
                         SFld("location", Str()), SFld("method", Str()),
                         SFld("page", Str()), SFld("registration", Dbl()),
                         SFld("sessionId", Lng()), SFld("song", Str()),
                         SFld("status", Str()), SFld("ts", Str()),
                         SFld("userAgent", Str()), SFld("userId", Str())])
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Reading log data JSON files from S3's input location....")
    # read log data file
    df = spark.read.json(log_data, schema = log_schema, mode='PERMISSIVE', columnNameOfCorruptRecord='corruptRecord').drop_duplicates()
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Filtering page by NextSong....")
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').drop_duplicates()

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for users data creation....")     
    # extract columns for users table    
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').where(df.userId.isNotNull()).drop_duplicates()
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for users table....")
    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table/')
          
          
    #Using print statement to understand where in spark statement we are
    print("\n Creating timeStamp variable....")
    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000)))
      
    
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for time data creation....")      
    # extract columns to create time table
    time_table = ( df.select("timestamp").withColumn("hour", hour("timestamp")).withColumn("day", dayofmonth("timestamp")) \
                    .withColumn("week", weekofyear("timestamp")).withColumn("weekday", dayofweek("timestamp")).withColumn("weekdayName", date_format("timestamp", "E")) \
                    .withColumn("month", month("timestamp")).withColumn("year", year("timestamp")).drop_duplicates()
                 )
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for time table and partitioned by year and month....")        
    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")      
    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")       
    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()).join(song_df, song_df.title == df.song) \
                        .select('songplayId', col('timestamp').alias('start_time'), col('userId'),
                         'level', 'song_id', 'artist_id', col('sessionId'), 'location', col('userAgent'))
    
    
    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner")\
                                     .select("songplayId", songplays_table.start_time, "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for song paly table and partitioned by year and month....")       
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka")\
        .option("kafka.bootstrap.servers", "localhost:9092")\
        .option("subscribe", "org.sf.crime.calls")\
        .option("maxOffsetPerTrigger", 200)\
        .option("startingOffsets", "earliest") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # TODO select original_crime_type_name and disposition
    distinct_table = service_table\
        .select(
                psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time")\
                , psf.col("original_crime_type_name")\
                , psf.col("disposition")
        )

    # count the number of original crime type
    agg_df = distinct_table\
        .select(
            distinct_table.call_date_time,
            distinct_table.original_crime_type_name,
            distinct_table.disposition)\
    .withWatermark("call_date_time","60 minutes")\
        .groupBy(
            psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"),
            psf.col("original_crime_type_name")
        )\
        .count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # TODO write output stream
    query = agg_df \
        .writeStream\
        .format("console")\
        .outputMode("Complete")\
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()

    # TODO get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # TODO rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    # TODO join on disposition column
    join_query = agg_df\
                .join(radio_code_df, "disposition")\
                .writeStream\
                .format("console")\
                .queryName("join_query")\
                .start()

    join_query.awaitTermination()
Example #26
spark = SparkSession.builder.appName('Packt').getOrCreate()

# get the raw data from a local socket
raw_stream = spark.readStream.format('socket').option('host',
                                                      'localhost').option(
                                                          'port', 1234).load()

# set up the Twitter date-time format
tweet_datetime_format = 'EEE MMM dd HH:mm:ss ZZZZ yyyy'

# parse the json to get separate fields
tweet_stream = raw_stream.select(from_json('value', schema).alias('tweet'))

# create a timestamp by parsing the created_at field
timed_stream = tweet_stream.select(
    to_timestamp('tweet.created_at', tweet_datetime_format).alias('timestamp'),
    'tweet.text')

# To display the tweets without windowing:
# query = timed_stream.writeStream.outputMode('append').format('console').start()
# query.awaitTermination()

# create a sliding window of 1 minute with a slide of 10 seconds, with a 'slack time' of 2 seconds
windowed = timed_stream \
    .withWatermark('timestamp', '2 seconds') \
    .groupBy(window('timestamp', '1 minute', '10 seconds'))

# count the tweets per window
counts_per_window = windowed.count().orderBy('window')

# output the windows and counts to the console
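# The original snippet ends here; below is a hedged sketch of the console sink described by
# the comment above (complete output mode is needed because of the orderBy on an aggregation).
query = counts_per_window.writeStream \
    .outputMode('complete') \
    .format('console') \
    .option('truncate', 'false') \
    .start()
query.awaitTermination()
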
def process_log_data(spark, input_data, output_data):
    """
    Uses a given Spark session to process log data.

    :param input_data: Path to a folder or S3 bucket, where the input data lives.
    :param output_data: Path to a folder or S3 bucket, where the output should be stored.
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data', '*.json')

    # read log data file
    logger.debug(f"Read data from {log_data}")
    df = spark.read.json(log_data)
    logger.debug('Auto detected JSON schema')
    df.printSchema()
    
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table    
    # Columns: user_id, first_name, last_name, gender, level
    users_table = df.selectExpr([
        "userId as user_id", 
        "firstName as first_name", 
        "lastName as last_name", 
        "gender", 
        "level"
    ]).dropDuplicates()
    
    # write users table to parquet file
    users_table_fp = os.path.join(output_data, 'dim_user.parquet')
    logger.debug(f"Write users table: {users_table_fp}")
    users_table.write.parquet(users_table_fp, mode='overwrite')

    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", F.to_timestamp(df.ts/1000))  # time is in millisecond
    
    # create datetime column from original timestamp column
    df = df.withColumn("datetime", F.to_date(df.timestamp))
    
    # extract columns to create time table
    # Columns: start_time, hour, day, week, month, year, weekday
    time_table = df.selectExpr([
        "timestamp as start_time",
        "hour(datetime) as hour",
        "dayofmonth(datetime) as day",
        "weekofyear(datetime) as week",
        "month(datetime) as month",
        "year(datetime) as year",
        "dayofweek(datetime) as weekday",
    ])
    
    # write time table to parquet files partitioned by year and month
    time_table_fp = os.path.join(output_data, 'dim_time.parquet')
    logger.debug(f"Write time table: {time_table_fp}")
    time_table.write.parquet(time_table_fp, mode='overwrite', partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_table_fp = os.path.join(output_data, 'dim_song.parquet')
    song_df = spark.read.parquet(song_table_fp)
    song_df = song_df\
        .selectExpr([
            'song_id as song_song_id',
            'artist_id as song_artist_id',
            'title as song_title'
        ])

    # read in artist data to use for songplays table
    artist_table_fp = os.path.join(output_data, 'dim_artist.parquet')
    artist_df = spark.read.parquet(artist_table_fp)
    artist_df = artist_df.selectExpr([
        'artist_id as artist_artist_id', 
        'name as artist_name'
    ])

    # extract columns from joined song and log datasets to create songplays table 
    # Columns: songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
    songplays_table = df.selectExpr([
            'timestamp as start_time',
            'userId as user_id',
            'level',
            'song',
            'artist',
            'sessionId as session_id',
            "location",
            'userAgent as user_agent'
        ])\
        .join(song_df, df.song==song_df.song_title, 'left_outer') \
        .join(artist_df, df.artist==artist_df.artist_name, 'left_outer') \
        .selectExpr([
            "start_time",
            "user_id", 
            "level",
            "song_song_id as song_id",
            "artist_artist_id as artist_id",
            "session_id",
            "location",
            "user_agent",
            "year(start_time) as year",
            "month(start_time) as month",
        ]) \
        .dropDuplicates() \
        .withColumn('songplay_id', F.monotonically_increasing_id())
        
    # write songplays table to parquet files partitioned by year and month
    songplays_table_fp = os.path.join(output_data, 'fact_songplay.parquet')
    logger.debug(f"Write songplay table: {songplays_table_fp}")
    songplays_table.write.parquet(songplays_table_fp, mode='overwrite', partitionBy=['year', 'month'])
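A minimal driver sketch for the function above. The SparkSession setup, bucket paths, and main() entry point are assumptions for illustration, not part of the original example:

from pyspark.sql import SparkSession


def create_spark_session():
    # Plain local session; a real run against S3 would also need the hadoop-aws
    # package and credentials configured.
    return SparkSession.builder.appName("sparkify_etl").getOrCreate()


def main():
    spark = create_spark_session()
    input_data = "s3a://example-input-bucket/"    # hypothetical path
    output_data = "s3a://example-output-bucket/"  # hypothetical path
    # A song-processing step is assumed to run first, since dim_song.parquet
    # and dim_artist.parquet are read back by process_log_data.
    process_log_data(spark, input_data, output_data)
    spark.stop()


if __name__ == "__main__":
    main()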
示例#28
0
    else:
        dados_json = get(url, params={'limit': limit, 'offset': offset}).json()
        dados_json = dados_json['result']
        dados_json = dados_json['records']
    # Create a dataframe from the records fetched above
    df_json = spark.createDataFrame(dados_json, schema)
    # Append this page of data to the previously created empty dataframe
    df_empty = df_empty.unionAll(df_json)
    # Increment the iteration counter
    interaction += 1
    # Increment the offset
    offset = limit * interaction

# Convert the date fields to timestamp
df = df_empty\
    .withColumn('LicenseAddDtTm', to_timestamp('LicenseAddDtTm', 'yyyy-MM-dd HH:mm:ss'))

# Repartition the dataframe so that only 10 files are written
df = df.repartition(10)

# Write the data to HDFS
df\
    .write\
    .mode("overwrite")\
    .option("path", hdfs + 'boston_active_food_establishment')\
    .saveAsTable("boston_active_food_establishment")

#df\
#    .write\
#    .mode("overwrite")\
#    .option("path",local + 'boston_active_food_establishment')\
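The fragment above is the tail of a pagination loop. A self-contained sketch of that pattern, where the endpoint URL, the trimmed one-field schema, and the empty-page stop condition are all assumptions:

from requests import get
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName("paginated_api_ingest_sketch").getOrCreate()

url = "https://example.org/api/3/action/datastore_search"  # hypothetical endpoint
schema = StructType([StructField("LicenseAddDtTm", StringType(), True)])  # trimmed for the sketch

limit, offset, interaction = 1000, 0, 0
df_empty = spark.createDataFrame([], schema)

while True:
    payload = get(url, params={'limit': limit, 'offset': offset}).json()
    records = payload['result']['records']
    if not records:  # assumed convention: an empty page means we are done
        break
    df_empty = df_empty.unionAll(spark.createDataFrame(records, schema))
    interaction += 1
    offset = limit * interaction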
示例#29
0
def main(args=None):
    """
    Pre process raw input data and save in
    cleansed state to /Processed directory
    """
    parser = build_parser()
    args = parser.parse_args(args)
    #ensure only one sc and spark instance is running
    global MySpark
    MySpark = MySpark or MyPySpark(master='local[3]')

    #make schema
    int_fields_l = []
    str_fields_l = [
        "copyright", "description", "end", "f", "geography", "iso3166", "lat",
        "latlon", "lon", "name", "source", "start", "units", "last_updated"
    ]
    str_fields_no_null_l = [
        "series_id",
    ]
    # timestamp_fields_l = ["last_updated",]
    str_fields_schema_l = [
        StructField(field_name, StringType(), nullable=True)
        for field_name in str_fields_l
    ]
    str_fields_no_null_schema_l = [
        StructField(field_name, StringType(), nullable=False)
        for field_name in str_fields_no_null_l
    ]
    array_fields_l = [
        StructField("data", ArrayType(ArrayType(StringType()))),
    ]
    # timestamp_schema_l = [StructField(field_name, StringType(), nullable=True) for field_name in timestamp_fields_l]
    electricity_schema = StructType(str_fields_schema_l +
                                    str_fields_no_null_schema_l +
                                    array_fields_l)
    #limit for testing
    electricity_raw_df = MySpark\
        .spark\
        .read\
        .json('/EIAElec/ELEC.json', schema = electricity_schema)

    electricity_raw_monthly_df = electricity_raw_df\
        .filter(pysF.col("f") == 'M')

    electricity_fact_df = MyPySpark.eia_data_explode(
        electricity_raw_monthly_df\
            .filter(pysF.col("series_id").isNotNull())\
            .select(
                "series_id",
                "data"))

    electricity_base_dim_df = electricity_raw_monthly_df\
        .drop("data", "latlon")\
        .filter(pysF.col("series_id").isNotNull())\
        .withColumn(
            "last_updated",
            pysF.to_timestamp("last_updated", "yyyy-MM-dd'T'HH:mm:ssXXX"))\
        .withColumn(
            "lat",
            pysF.col("lat").cast(pysT.DoubleType())
        )\
        .withColumn(
            "lon",
            pysF.col("lon").cast(pysT.DoubleType())
        )\
        .withColumn(
            "start",
            pysF.col("start").cast(pysT.IntegerType())
        )\
        .withColumn(
            "end",
            pysF.col("end").cast(pysT.IntegerType())
        )\
        .withColumn(
            "split_name",
            pysF.split("name", ":")
        )

    # Raw strings keep "\." as a literal-dot regex instead of a Python escape sequence.
    power_rows_l = [
        r"^ELEC\.GEN\.", r"^ELEC\.CONS_TOT\.", r"^ELEC\.CONS_TOT_BTU\.",
        r"^ELEC\.CONS_EG\.", r"^ELEC\.CONS_EG_BTU\.", r"^ELEC\.CONS_UTO\.",
        r"^ELEC\.CONS_UTO_BTU\."
    ]

    plant_level_rows_l = [
        r"^ELEC\.PLANT\.GEN\.", r"^ELEC\.PLANT\.CONS_TOT\.",
        r"^ELEC\.PLANT\.CONS_TOT_BTU\.", r"^ELEC\.PLANT\.CONS_EG\.",
        r"^ELEC\.PLANT\.CONS_EG_BTU\.", r"^ELEC\.PLANT\.CONS_UTO\.",
        r"^ELEC\.PLANT\.CONS_UTO_BTU\.", r"^ELEC\.PLANT\.AVG_HEAT\."
    ]

    retail_rows_l = [
        r"^ELEC\.SALES\.", r"^ELEC\.REV\.", r"^ELEC\.PRICE\.",
        r"^ELEC\.CUSTOMERS\."
    ]

    fossil_fuel_rows_l = [
        r"^ELEC\.STOCKS\.",
        r"^ELEC\.RECEIPTS\.",
        r"^ELEC\.RECEIPTS_BTU\.",
        r"^ELEC\.COST\.",
        r"^ELEC\.COST_BTU\.",
    ]

    fossil_fuel_quality_rows_l = [
        r"^ELEC\.SULFUR_CONTENT\.", r"^ELEC\.ASH_CONTENT\."
    ]

    electricity_power_dim_df = electricity_base_dim_df\
        .filter(
            pysF.col("series_id").rlike("|".join(power_rows_l))
        )\
        .withColumn(
            "value_type",
            pysF.trim(pysF.col("split_name").getItem(0))
        )\
        .withColumn(
            "fuel_type",
            pysF.trim(pysF.col("split_name").getItem(1))
        )\
        .withColumn(
            "region",
            pysF.trim(pysF.col("split_name").getItem(2))
        )\
        .withColumn(
            "sector",
            pysF.trim(pysF.col("split_name").getItem(3))
        )\
        .withColumn(
            "frequency",
            pysF.trim(pysF.col("split_name").getItem(4))
        )

    electricity_plant_level_dim_df = electricity_base_dim_df\
        .filter(
            pysF.col("series_id").rlike("|".join(plant_level_rows_l))
        )\
        .withColumn(
            "value_type",
            pysF.trim(pysF.col("split_name").getItem(0))
        )\
        .withColumn(
            "plant_name",
            pysF.trim(pysF.col("split_name").getItem(1))
        )\
        .withColumn(
            "fuel_type",
            pysF.trim(pysF.col("split_name").getItem(2))
        )\
        .withColumn(
            "engine_type",
            pysF.trim(pysF.col("split_name").getItem(3))
        )\
        .withColumn(
            "frequency",
            pysF.trim(pysF.col("split_name").getItem(4))
        )\
        .withColumn(
            "plant_id",
            pysF.regexp_extract(pysF.col("series_id"), r".*\.(\d+)-.*", 1)
        )

    electricity_retail_dim_df = electricity_base_dim_df\
        .filter(
            pysF.col("series_id").rlike("|".join(retail_rows_l))
        )\
        .withColumn(
            "value_type",
            pysF.trim(pysF.col("split_name").getItem(0))
        )\
        .withColumn(
            "region",
            pysF.trim(pysF.col("split_name").getItem(1))
        )\
        .withColumn(
            "sector",
            pysF.trim(pysF.col("split_name").getItem(2))
        )\
        .withColumn(
            "frequency",
            pysF.trim(pysF.col("split_name").getItem(3))
        )

    electricity_fossil_fuel_dim_df = electricity_base_dim_df\
        .filter(
            pysF.col("series_id").rlike("|".join(fossil_fuel_rows_l))
        )\
        .withColumn(
            "value_type",
            pysF.trim(pysF.col("split_name").getItem(0))
        )\
        .withColumn(
            "fuel_type",
            pysF.trim(pysF.col("split_name").getItem(1))
        )\
        .withColumn(
            "region",
            pysF.trim(pysF.col("split_name").getItem(2))
        )\
        .withColumn(
            "sector",
            pysF.trim(pysF.col("split_name").getItem(3))
        )\
        .withColumn(
            "frequency",
            pysF.trim(pysF.col("split_name").getItem(4))
        )

    electricity_fossil_fuel_quality_dim_df = electricity_base_dim_df\
        .filter(
            pysF.col("series_id").rlike("|".join(fossil_fuel_quality_rows_l))
        )\
        .withColumn(
            "value_type",
            pysF.trim(pysF.col("split_name").getItem(0))
        )\
        .withColumn(
            "quality_type",
            pysF.trim(pysF.col("split_name").getItem(1))
        )\
        .withColumn(
            "fuel_type",
            pysF.trim(pysF.col("split_name").getItem(2))
        )\
        .withColumn(
            "region",
            pysF.trim(pysF.col("split_name").getItem(3))
        )\
        .withColumn(
            "sector",
            pysF.trim(pysF.col("split_name").getItem(4))
        )\
        .withColumn(
            "frequency",
            pysF.trim(pysF.col("split_name").getItem(5))
        )

    #Catch-all for any missed dimensions
    electricity_missed_dim_df = electricity_base_dim_df\
        .filter(
            ~pysF.col("series_id").rlike("|".join(
                    power_rows_l +
                    plant_level_rows_l +
                    retail_rows_l +
                    fossil_fuel_rows_l +
                    fossil_fuel_quality_rows_l))
        )

    # save plans to ExplainFiles, write to hdfs, and sync
    df_l = [
        {
            "df": electricity_fact_df,
            "description": "preprocess_electricity_facts",
            "path": "/Processed/ElectricityFactDF"
        },
        {
            "df": electricity_power_dim_df,
            "description": "preprocess_electricity_power_dimensions",
            "path": "/Processed/ElectricityPowerDimDF"
        },
        {
            "df": electricity_plant_level_dim_df,
            "description": "preprocess_electricity_plant_level_dimensions",
            "path": "/Processed/ElectricityPlantLevelDimDF"
        },
        {
            "df": electricity_retail_dim_df,
            "description": "preprocess_electricity_retail_dimensions",
            "path": "/Processed/ElectricityRetailDimDF"
        },
        {
            "df": electricity_fossil_fuel_dim_df,
            "description": "preprocess_electricity_fossil_fuel_dimensions",
            "path": "/Processed/ElectricityFossilFuelDimDF"
        },
        {
            "df": electricity_fossil_fuel_quality_dim_df,
            "description":
            "preprocess_electricity_fossil_fuel_quality_dimensions",
            "path": "/Processed/ElectricityFossilFuelQualityDimDF"
        },
        {
            "df": electricity_missed_dim_df,
            "description": "preprocess_electricity_missed_dimensions",
            "path": "/Processed/ElectricityMissedDimDF"
        },
    ]

    for df_d in df_l:
        #Common formatting
        if "split_name" in df_d["df"].columns:
            df_d["df"] = df_d["df"].drop("split_name")
        df_d["df"] = df_d["df"]\
            .replace(
                {
                    "":None,
                    "null":None
                })
        MySpark.eia_output_df(df_d=df_d,
                              display_output=args.display_test,
                              s3_backup=args.s3)
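MyPySpark.eia_data_explode is the author's helper and its implementation is not shown in this example. A plausible sketch of such a step, assuming each element of the "data" array is a [period, value] string pair:

from pyspark.sql import functions as pysF


def eia_data_explode_sketch(df):
    # Turn (series_id, data=[[period, value], ...]) into long-format fact rows.
    return (df
            .select("series_id", pysF.explode("data").alias("datapoint"))
            .select(
                "series_id",
                pysF.col("datapoint").getItem(0).alias("period"),
                pysF.col("datapoint").getItem(1).cast("double").alias("value")))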
示例#30
0
# MAGIC %sql
# MAGIC SET spark.sql.legacy.timeParserPolicy = LEGACY

# COMMAND ----------

# MAGIC %md
# MAGIC # CORRECT

# COMMAND ----------

from pyspark.sql.functions import to_timestamp
display(
    df_date_tmp.withColumn(
        'created_at_date',
        to_timestamp(df_date_tmp.created_at_split,
                     'dd MMM yyyy HH:mm:ss')).select('created_at_split',
                                                     'created_at_date'))

# COMMAND ----------

from pyspark.sql.functions import date_format
display(
    df_date.withColumn('created_at_date',
                       date_format(df_date.created_at_split,
                                   'dd')).select('created_at_split',
                                                 'created_at_date'))

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT date_format(to_timestamp('28 Jul 2020 01:29:45', 'dd MMM yyyy HH:mm:ss'), 'dd')

# COMMAND ----------

from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")


# COMMAND ----------

from pyspark.sql.functions import to_timestamp, col
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()


# COMMAND ----------

from pyspark.sql.functions import coalesce, col
df.select(coalesce(col("Description"), col("CustomerId"))).show()


# COMMAND ----------

df.na.drop("all", subset=["StockCode", "InvoiceNo"])


# COMMAND ----------
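A self-contained sketch of the parsing pattern these cells exercise; the sample row is made up, and the config call mirrors the SET statement at the top of the example:

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, date_format, col

spark = SparkSession.builder.appName("date_parsing_sketch").getOrCreate()
# In Spark 3.x the 'MMM' month-name pattern may need the legacy parser:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

df = spark.createDataFrame([("28 Jul 2020 01:29:45",)], ["created_at_split"])
df = df.withColumn("created_at_date",
                   to_timestamp(col("created_at_split"), "dd MMM yyyy HH:mm:ss"))
df.select("created_at_split",
          date_format("created_at_date", "dd").alias("day_of_month")).show()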
示例#32
0
def main():

    spark = SparkSession \
          .builder \
          .appName("spark_streaming_app") \
          .getOrCreate()

    df = (spark.readStream.format('kafka').option(
        'kafka.bootstrap.servers',
        '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092').option(
            'subscribe', 'stream_data').option('startingOffsets',
                                               'earliest').load())

    df = df.selectExpr('CAST(value as STRING)')

    df = df.select(from_json(col('value'), data_schema).alias('df'))

    func1 = udf(lambda x: states[x.upper()], StringType())

    df = df.filter(col('df.group.group_country') == 'us') \
        .select('df') \
        .withColumn('group_state', func1('df.group.group_state')) \
        .withColumn('time', from_unixtime(col('df.event.time') / 1000))

    df2 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'), col('df.group.group_city'),
            col('df.group.group_country'), col('df.group.group_id'),
            col('df.group.group_name'), col('group_state')).alias('value'))

    stream2 = df2.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-meetups") \
        .option("checkpointLocation", "US-metups-checkpoint")

    stream2 = stream2.start()

    df3 = df.withColumn('timestamp', to_timestamp('time')) \
        .withWatermark('timestamp', '1 minute') \
        .groupBy(window('timestamp', '1 minute')) \
        .agg(struct(
            month('window.end').alias('month'),
            dayofmonth('window.end').alias('day_of_the_month'),
            hour('window.end').alias('hour'),
            minute('window.end').alias('minute'),
            collect_set('df.group.group_city').alias('cities')).alias('value')) \
        .select('value')

    stream3 = df3.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-cities-every-minute") \
        .option("checkpointLocation", "US-cities-every-minute-checkpoint")

    stream3 = stream3.start()

    df4 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'), col('df.group.group_topics.topic_name'),
            col('df.group.group_city'), col('df.group.group_country'),
            col('df.group.group_id'), col('df.group.group_name'),
            col('group_state')).alias('value')).filter(
                arrays_overlap(
                    'value.topic_name',
                    array(lit("Computer programming"), lit("Big Data"),
                          lit("Machine Learning"), lit("Python"), lit("Java"),
                          lit("Web Development"))))

    stream4 = df4.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "Programming-meetups") \
        .option("checkpointLocation", "Programming-metups-checkpoint")

    stream4 = stream4.start()

    stream4.awaitTermination()

    spark.stop()
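For local debugging, a console-sink variant of one of the writers above could replace the Kafka sink inside main(); the sink options and the awaitAnyTermination() call are assumptions, not part of the original job:

    # Inspect df2 on the console instead of publishing to the "US-meetups" topic.
    debug_stream = df2.select(to_json('value').alias('value')).writeStream \
        .format("console") \
        .option("truncate", "false") \
        .option("numRows", 20) \
        .outputMode("append") \
        .start()

    # Block until any active stream terminates, rather than only stream4.
    spark.streams.awaitAnyTermination()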