from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F, types as T
from pyspark.sql.functions import col, unix_timestamp

conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

# filter sources that touch more than 5 addresses or more than 3 ports on one IP

# get data
data = sqlContext.read.parquet("CTU-Flows_main/Flows.parquet/_yyyymd=2018-3-7")

# keep TCP flows whose DstAddr is inside the university network (147.32.x.x)
# and whose SrcAddr is outside of it
df = data.filter(data.Proto == "tcp") \
    .filter(data.DstAddr.startswith("147.32.")) \
    .filter(~data.SrcAddr.startswith("147.32.")) \
    .select("DstAddr", "Dport", "State", "StartTime", "SrcAddr")

touchAddrLimit = 5
touchPortLimit = 3

# convert the port column to an integer and derive the day from the timestamp
df = df.withColumn('Dport', df["Dport"].cast(T.IntegerType()))
df = df.withColumn('day', unix_timestamp('StartTime', 'yyyy/MM/dd').cast(T.TimestampType()))
df = df.withColumn('timestamp', unix_timestamp('StartTime', 'yyyy/MM/dd hh:mm:ss.SSSSSS').cast(T.TimestampType()))

# source addresses that exceed either limit on a given day
srcAddrs = df.select('SrcAddr', 'DstAddr', 'Dport', 'day').distinct() \
    .groupBy('SrcAddr', 'day') \
    .agg(F.countDistinct('DstAddr').alias('addrCount'),
         F.countDistinct('Dport').alias('portCount')) \
    .filter((col('addrCount') >= touchAddrLimit) | (col('portCount') >= touchPortLimit)) \
    .select('SrcAddr', 'day')

df = df.join(srcAddrs, ['SrcAddr', 'day'], 'leftsemi')

# distinct SrcAddr and Dport per day, then count connections per (day, Dport)
res = df.select("SrcAddr", "day", "Dport").distinct() \
    .groupBy("day", "Dport").count() \
    .selectExpr("day", "Dport", "count as connectionCount")

result = res.groupBy("Dport") \
    .agg(F.avg('connectionCount').alias("average_count")) \
    .select("Dport", "average_count") \
    .sort(col('average_count').desc()) \
    .head(100)

print("********RESULTS*************")
for row in result:
    print("{},{}".format(row["Dport"], row["average_count"]))
print("********RESULTS END*************")
df_convert_result = df_base \
    .withColumn("today", F.lit("today"))

# In[ ]:

"""
Converting DataFrame column types
"""

# Date
df_convert_result1 = df_base \
    .withColumn("tdate", F.lit(str_yyyymmdd_to_date(target_date))) \
    .withColumn("tdate", F.lit(F.col("tdate").cast("date")))

# Timestamp
df_wakati_result2 = df_base \
    .withColumn("created_at", df_wakati_base.created_at.cast(T.TimestampType()))

# In[ ]:

# -------------- BigQuery Connector --------------

# In[ ]:

"""
The environment created with dataproc-mecab-init-shell has the BigQuery connector
installed, so Spark can read from and write to BigQuery.
https://cloud.google.com/dataproc/docs/tutorials/bigquery-connector-spark-example?hl=ja
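A minimal sketch of the read/write pattern the note above refers to, assuming the spark-bigquery-connector format is available and the notebook's `spark` session exists; the table names and the temporaryGcsBucket value are placeholders, not from the original.

# Hypothetical example only: table names and the staging bucket are placeholders.
df_bq = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data.samples.shakespeare")
    .load()
)

(
    df_wakati_result2.write.format("bigquery")
    .option("table", "my_dataset.my_table")
    .option("temporaryGcsBucket", "my-staging-bucket")
    .mode("overwrite")
    .save()
)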
def get_spark_testing_client(data_directory): global _spark_testing_client if _spark_testing_client is not None: return _spark_testing_client pytest.importorskip('pyspark') import pyspark.sql.types as pt _spark_testing_client = ibis.spark.connect() s = _spark_testing_client._session df_functional_alltypes = s.read.csv( path=str(data_directory / 'functional_alltypes.csv'), schema=pt.StructType([ pt.StructField('index', pt.IntegerType(), True), pt.StructField('Unnamed: 0', pt.IntegerType(), True), pt.StructField('id', pt.IntegerType(), True), # cast below, Spark can't read 0/1 as bool pt.StructField('bool_col', pt.ByteType(), True), pt.StructField('tinyint_col', pt.ByteType(), True), pt.StructField('smallint_col', pt.ShortType(), True), pt.StructField('int_col', pt.IntegerType(), True), pt.StructField('bigint_col', pt.LongType(), True), pt.StructField('float_col', pt.FloatType(), True), pt.StructField('double_col', pt.DoubleType(), True), pt.StructField('date_string_col', pt.StringType(), True), pt.StructField('string_col', pt.StringType(), True), pt.StructField('timestamp_col', pt.TimestampType(), True), pt.StructField('year', pt.IntegerType(), True), pt.StructField('month', pt.IntegerType(), True), ]), mode='FAILFAST', header=True, ) df_functional_alltypes = df_functional_alltypes.withColumn( "bool_col", df_functional_alltypes["bool_col"].cast("boolean")) df_functional_alltypes.createOrReplaceTempView('functional_alltypes') df_batting = s.read.csv( path=str(data_directory / 'batting.csv'), schema=pt.StructType([ pt.StructField('playerID', pt.StringType(), True), pt.StructField('yearID', pt.IntegerType(), True), pt.StructField('stint', pt.IntegerType(), True), pt.StructField('teamID', pt.StringType(), True), pt.StructField('lgID', pt.StringType(), True), pt.StructField('G', pt.IntegerType(), True), pt.StructField('AB', pt.DoubleType(), True), pt.StructField('R', pt.DoubleType(), True), pt.StructField('H', pt.DoubleType(), True), pt.StructField('X2B', pt.DoubleType(), True), pt.StructField('X3B', pt.DoubleType(), True), pt.StructField('HR', pt.DoubleType(), True), pt.StructField('RBI', pt.DoubleType(), True), pt.StructField('SB', pt.DoubleType(), True), pt.StructField('CS', pt.DoubleType(), True), pt.StructField('BB', pt.DoubleType(), True), pt.StructField('SO', pt.DoubleType(), True), pt.StructField('IBB', pt.DoubleType(), True), pt.StructField('HBP', pt.DoubleType(), True), pt.StructField('SH', pt.DoubleType(), True), pt.StructField('SF', pt.DoubleType(), True), pt.StructField('GIDP', pt.DoubleType(), True), ]), header=True, ) df_batting.createOrReplaceTempView('batting') df_awards_players = s.read.csv( path=str(data_directory / 'awards_players.csv'), schema=pt.StructType([ pt.StructField('playerID', pt.StringType(), True), pt.StructField('awardID', pt.StringType(), True), pt.StructField('yearID', pt.IntegerType(), True), pt.StructField('lgID', pt.StringType(), True), pt.StructField('tie', pt.StringType(), True), pt.StructField('notes', pt.StringType(), True), ]), header=True, ) df_awards_players.createOrReplaceTempView('awards_players') df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar']) df_simple.createOrReplaceTempView('simple') df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col']) df_struct.createOrReplaceTempView('struct') df_nested_types = s.createDataFrame([( [1, 2], [[3, 4], [5, 6]], { 'a': [[2, 4], [3, 5]] }, )], [ 'list_of_ints', 'list_of_list_of_ints', 'map_string_list_of_list_of_ints' ]) df_nested_types.createOrReplaceTempView('nested_types') df_complicated = 
s.createDataFrame([({ (1, 3): [[2, 4], [3, 5]] }, )], ['map_tuple_list_of_list_of_ints']) df_complicated.createOrReplaceTempView('complicated') df_udf = s.createDataFrame([ ('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b'), ], ['a', 'b', 'c', 'key']) df_udf.createOrReplaceTempView('udf') df_udf_nan = s.createDataFrame( pd.DataFrame({ 'a': np.arange(10, dtype=float), 'b': [3.0, np.NaN] * 5, 'key': list('ddeefffggh'), })) df_udf_nan.createOrReplaceTempView('udf_nan') df_udf_null = s.createDataFrame( [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)], ['a', 'b', 'key']) df_udf_null.createOrReplaceTempView('udf_null') df_udf_random = s.createDataFrame( pd.DataFrame({ 'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), 'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), 'key': list('ddeefff'), })) df_udf_random.createOrReplaceTempView('udf_random') return _spark_testing_client
def process_log_data(spark, input_data, output_data): """Function that processes log data into tables Args: spark (SparkSession): SparkSession input_data(str): path to input storage bucket output_data(str): path to output storage bucket Returns: None """ # get filepath to log data file # log_data = inpuut_data + 'log_data/*/*/*.json' log_data = input_data + 'log_data/2018/11/2018-11*.json' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df[df['page'] == 'NextSong'] #df.printSchema() df.createOrReplaceTempView("temp_data") # extract columns for users table users_table = spark.sql(''' SELECT DISTINCT userId, firstName, lastName, gender, level FROM temp_data WHERE userId not IN ('') ''') # write users table to parquet files users_table.limit(5).write.parquet(os.path.join(output_data, 'users')) df = df.filter(df.ts.isNotNull()) # create timestamp column from original timestamp column get_timestamp = udf( lambda x: datetime.fromtimestamp((x / 1000.0)) if x != '' else '', t.TimestampType()) df = df.withColumn('start_time', get_timestamp(df.ts)) # extract columns to create time table df=df.dropDuplicates()\ .withColumn("hour", hour("start_time"))\ .withColumn("day", dayofmonth("start_time"))\ .withColumn("week", weekofyear("start_time"))\ .withColumn("month", month("start_time"))\ .withColumn("year", year("start_time"))\ .withColumn("weekday", dayofweek("start_time")) time_table = df.select( ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']) # write time table to parquet files partitioned by year and month time_table.limit(5).write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time')) # read in song data to use for songplays table songs_parquet = output_data + 'songs' songs_df = spark.read.parquet(songs_parquet) artists_parquet = output_data + 'artists/*.parquet' artists_df = spark.read.parquet(artists_parquet) songs_df = songs_df.join(artists_df, ['artist_id']) songs_df = songs_df.drop('year') df = df.join(songs_df, ((df.song == songs_df.song_id) & (df.artist == songs_df.artist_name)), how='left') df.createOrReplaceTempView("temp_data") # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql(''' SELECT year, month, start_time, userID, level, song_id, artist_id, sessionId, location, userAgent FROM temp_data ''') # sort by ts and use row number for songplay_id window = Window.orderBy(col('start_time')) songplays_table = songplays_table.withColumn('songplay_id', row_number().over(window)) # write songplays table to parquet files partitioned by year and month songplays_table.limit(10).write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'song_plays'))
KAFKA_BROKER_URL = os.environ.get("KAFKA_BROKER_URL", "localhost:9092")
KAFKA_TOPIC = "udacity.project.spark-streaming.police"

schema = pst.StructType([
    pst.StructField("crime_id", pst.StringType()),                  # : "183653763"
    pst.StructField("original_crime_type_name", pst.StringType()),  # : "Traffic Stop"
    pst.StructField("report_date", pst.DateType()),                 # : "2018-12-31T00:00:00.000"
    pst.StructField("call_date", pst.DateType()),                   # : "2018-12-31T00:00:00.000"
    pst.StructField("offense_date", pst.DateType()),                # : "2018-12-31T00:00:00.000"
    pst.StructField("call_time", pst.StringType()),                 # : "23:57"
    pst.StructField("call_date_time", pst.TimestampType()),         # : "2018-12-31T23:57:00.000"
    pst.StructField("disposition", pst.StringType()),               # : "ADM"
    pst.StructField("address", pst.StringType()),                   # : "Geary Bl/divisadero St"
    pst.StructField("city", pst.StringType()),                      # : "San Francisco"
    pst.StructField("state", pst.StringType()),                     # : "CA"
    pst.StructField("agency_id", pst.StringType()),                 # : "1"
    pst.StructField("address_type", pst.StringType()),              # : "Intersection"
    pst.StructField("common_location", pst.StringType()),           # : ""
])


def run_spark_job(spark):
    df = (
        spark.readStream.format("kafka").option(
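The readStream call above is cut off mid-option. A minimal sketch of how it typically continues, parsing the Kafka value with the schema defined above; the offset/trigger options and the `psf` functions alias are assumptions, not from the original.

import pyspark.sql.functions as psf

df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .option("maxOffsetsPerTrigger", 200)
    .load()
)

# Kafka delivers the payload as bytes; cast to string and unpack the JSON
# into the columns declared in `schema`.
service_table = (
    df.selectExpr("CAST(value AS STRING)")
    .select(psf.from_json(psf.col("value"), schema).alias("data"))
    .select("data.*")
)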
def process_log_data(spark, input_data, output_data): """ Processes log data and writes the users, the time, and the songplays table into specified S3 bucket in parquet format. Parameters ------- spark: object Spark Session object to handle the Spark Processes input_data: str The location of the files to read from S3 Bucket output_data: str The location of the files to write into S3 Bucket """ # get filepath to log data file log_data = input_data + "log_data/*/*/*events.json" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_cols = [ "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level" ] users_table = df.selectExpr(users_cols).drop_duplicates() # write users table to parquet files users_table.write.parquet(output_data + "users/") # create datetime column from original timestamp column get_datetime = F.udf(lambda x: D.fromtimestamp(int(x / 1000)), T.TimestampType()) df = df.withColumn("start_time", get_datetime("ts")) # extract columns to create time table time_table = df.select("start_time") \ .withColumn("hour", F.hour("start_time")) \ .withColumn("day", F.dayofmonth("start_time")) \ .withColumn("week", F.weekofyear("start_time")) \ .withColumn("month", F.month("start_time")) \ .withColumn("year", F.year("start_time")) \ .withColumn("weekday", F.date_format('start_time', 'EEEE')).drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + 'time/') # read in song data to use for songplays table songs_table = spark.read.parquet(output_data + "songs/") songs_table = songs_table.selectExpr( ["song_id", "artist_id as s_artist_id", "title"]) artists_table = spark.read.parquet(output_data + "artists/") artists_table = artists_table.select(["artist_id", "location", "name"]) song_df = songs_table.join( artists_table, songs_table.s_artist_id == artists_table.artist_id, "inner") cols = [ "start_time", "userId as user_id", "level", "sessionId as session_id", "userAgent as user_agent", "song", "artist" ] df = df.selectExpr(cols) \ .withColumn("songplay_id", F.monotonically_increasing_id()) \ .withColumn("month", F.month("start_time")) \ .withColumn("year", F.year("start_time")) df = df.join(song_df, (df.song == song_df.title) & (df.artist == song_df.name), "left") # extract columns from joined song and log datasets to create songplays table songplays_cols = [ "songplay_id", "start_time", "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month" ] songplays_table = df.select(songplays_cols) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data): """ Process log data, i.e. read the log files into Spark and transform into OLAP like user, time (dimensions) tables and songplay (fact) table. Store the fact and dimension tables into Spark parquet files. Parameters ---------- spark : The Spark session to use input_data : string The path to the input log files (in S3) output_data : string The path to where the generated parquet files will be stored """ # get filepath to log data file song_data_path = input_data + 'song_data' + '/*/*/*/*.json' log_data = input_data + 'log_data' + '/*/*/*.json' event_df = spark.read.json(log_data) # filter by actions for song plays event_df = event_df.filter(event_df.page == 'NextSong') # extract columns for users table user_table = event_df.select('userId', 'firstName', 'lastName', \ 'gender', 'level', ) \ .dropDuplicates() # write users table to parquet files user_table.write.mode('overwrite').parquet(output_data + '/user_table') # create timestamp column from original timestamp column def get_ts (ts): return datetime.fromtimestamp(ts / 1000.0) get_datetime = udf(lambda z: get_ts(z), t.TimestampType()) datetime_df = event_df.select('ts').dropDuplicates() \ .withColumn('datetime', get_datetime('ts')) # extract columns to create time table time_table = datetime_df.select(col('ts').alias('starttime'), hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), dayofweek('datetime').alias('weekday')) # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite") \ .partitionBy("year", "month") \ .parquet('s3://data-lake-sparkify/time_table') # read in song data again to use for creating songplays table # Would probably have been a good idea to keep the dataframe # created in process_song_data, but the template seems to # dictate that it should be read again... song_df = spark.read.json(song_data_path) # extract columns from joined song and log datasets to create # songplays table songplays_table = event_df.join(song_df, \ event_df.artist == song_df.artist_name) \ .select('ts', 'userId', 'song_id', \ 'artist_id', 'level', 'sessionId',\ 'location', 'userAgent') \ .dropDuplicates() # write songplays table to parquet files # partitioned by year and month (which does not make sense since it is # not in table, I will use song_id and artist_id instead). songplays_table.write.mode("overwrite") \ .partitionBy("song_id", "artist_id") \ .parquet('s3://data-lake-sparkify/songplay_table')
def process_data(data_path, database, table): ''' Purpose: To Process the API results using SPARK and store in HDFS ''' try: logger('INFO', "LOADING the Data in Spark for Processing ") logger( 'INFO', "In case of java.lang.OutOfMemoryError Tune Spark Parameters in conf/proprties.yml" ) df = myspark.read.format("json").options( inferSchema=True, dateFormat="yyyy-MM-dd", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSZZ", ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True, path="/tmp/tempfiles/").load() logger('INFO', "Changing the Data Type to Timestamp for few Columns") df = df.withColumn( "created", F.from_utc_timestamp( df.created, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast( T.TimestampType())).withColumn( "lastModified", F.from_utc_timestamp( df.lastModified, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast( T.TimestampType())).withColumn( "lastAccessed", F.from_utc_timestamp( df.lastAccessed, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast( T.TimestampType())) df = df.withColumn( "created", F.date_format(df.created, "yyyy-MM-dd HH:mm:ss")).withColumn( "lastModified", F.date_format(df.lastModified, "yyyy-MM-dd HH:mm:ss")).withColumn( "lastAccessed", F.date_format(df.lastAccessed, "yyyy-MM-dd HH:mm:ss")) logger('INFO', "Choosing the 11 required Columns ") finaldf = df.select("owner", "parentPath", "originalName", "created", "lastModified", "lastAccessed", "size", "sourceType", "permissions", "group", "identity") logger( 'INFO', "%s Records will be LOADED into %s.%s Table " % (finaldf.count(), database, table)) logger('INFO', "Create Table Command Prepared:") cmd = "CREATE TABLE IF NOT EXISTS %s.%s (owner STRING,parentPath STRING,originalName STRING,created timestamp,lastModified timestamp,lastAccessed timestamp,size BIGINT,sourceType STRING,permissions STRING,group STRING,identity BIGINT)STORED AS PARQUET" % ( database, table) logger('INFO', cmd) logger('INFO', "Creating the Table %s.%s if NOT EXISTS" % (database, table)) myspark.sql(cmd) finaldf.createOrReplaceTempView("mytable") logger('INFO', "INSERT INTO TABLE Table Command Prepared:") cmd = """INSERT INTO TABLE {database}.{table} SELECT owner,parentPath,originalName,created,lastModified,lastAccessed,size,sourceType,permissions,group,identity FROM mytable""".format( database=database, table=table) logger('INFO', cmd) logger('INFO', "Inserting the Data in %s.%s Table" % (database, table)) myspark.sql(cmd) logger( 'INFO', "%s Records Inserted successfully in %s.%s" % (finaldf.count(), database, table)) except Exception, e: logger('ERROR', "Job Failed with below Details !!") os.system("rm -f " + data_path + "/../pids/*.pid") logger('ERROR', "%s" % e) logger('ERROR', "Exiting !!") sys.exit()
def process_log_data(spark, input_data_ld, input_data_sd, output_data, \ run_start_time): """ Load JSON input data (log_data) from input_data path, process the data to extract users_table, time_table, songplays_table, and store the queried data to parquet files. """ print("Start processing log_data JSON files...") # get filepath to log data file log_data = input_data_ld # read log data file print("Reading log_data files from {}...".format(log_data)) df_ld = spark.read.json(log_data).dropDuplicates() print("...finished reading log_data...") # filter by actions for song plays df_ld_filtered = df_ld.filter(df_ld.page == 'NextSong') # extract columns for users table df_ld_filtered.createOrReplaceTempView("users_table_DF") users_table = spark.sql(""" SELECT DISTINCT userId AS user_id, firstName AS first_name, lastName AS last_name, gender, level FROM users_table_DF ORDER BY last_name """) print("Users_table schema:") users_table.printSchema() print("Users_table examples:") users_table.show(5) # write users table to parquet files users_table_path = output_data + "users_table.parquet" + "_" \ + run_start_time print("Writing users_table parquet files to {}..."\ .format(users_table_path)) users_table.write.mode("overwrite").parquet(users_table_path) print("...finished writing users_table...") # create timestamp column from original timestamp column print("Creating timestamp column...") @udf(t.TimestampType()) def get_timestamp(ts): return datetime.fromtimestamp(ts / 1000.0) df_ld_filtered = df_ld_filtered.withColumn("timestamp", \ get_timestamp("ts")) df_ld_filtered.printSchema() df_ld_filtered.show(5) # create datetime column from original timestamp column print("Creating datetime column...") @udf(t.StringType()) def get_datetime(ts): return datetime.fromtimestamp(ts / 1000.0)\ .strftime('%Y-%m-%d %H:%M:%S') df_ld_filtered = df_ld_filtered.withColumn("datetime", \ get_datetime("ts")) print("Log_data + timestamp + datetime columns schema:") df_ld_filtered.printSchema() print("Log_data + timestamp + datetime columns examples:") df_ld_filtered.show(5) # extract columns to create time table df_ld_filtered.createOrReplaceTempView("time_table_DF") time_table = spark.sql(""" SELECT DISTINCT datetime AS start_time, hour(timestamp) AS hour, day(timestamp) AS day, weekofyear(timestamp) AS week, month(timestamp) AS month, year(timestamp) AS year, dayofweek(timestamp) AS weekday FROM time_table_DF ORDER BY start_time """) print("Time_table schema:") time_table.printSchema() print("Time_table examples:") time_table.show(5) # write time table to parquet files partitioned by year and month time_table_path = output_data + "time_table.parquet" + "_" \ + run_start_time print("Writing time_table parquet files to {}..."\ .format(time_table_path)) time_table.write.mode("overwrite").partitionBy("year", "month")\ .parquet(time_table_path) print("...finished writing time_table...") # read in song data to use for songplays table song_data = input_data_sd print("Reading song_data files from {}...".format(song_data)) df_sd = spark.read.json(song_data) # Join log_data and song_data DFs print("Joining log_data and song_data DFs...") df_ld_sd_joined = df_ld_filtered\ .join(df_sd, (df_ld_filtered.artist == df_sd.artist_name) & \ (df_ld_filtered.song == df_sd.title)) print("...finished joining song_data and log_data DFs.") print("Joined song_data + log_data schema:") df_ld_sd_joined.printSchema() print("Joined song_data + log_data examples:") df_ld_sd_joined.show(5) # extract columns from joined song and log datasets # to create 
songplays table print("Extracting columns from joined DF...") df_ld_sd_joined = df_ld_sd_joined.withColumn("songplay_id", \ monotonically_increasing_id()) df_ld_sd_joined.createOrReplaceTempView("songplays_table_DF") songplays_table = spark.sql(""" SELECT songplay_id AS songplay_id, timestamp AS start_time, userId AS user_id, level AS level, song_id AS song_id, artist_id AS artist_id, sessionId AS session_id, location AS location, userAgent AS user_agent FROM songplays_table_DF ORDER BY (user_id, session_id) """) print("Songplays_table schema:") songplays_table.printSchema() print("Songplays_table examples:") songplays_table.show(5, truncate=False) # write songplays table to parquet files partitioned by year and month songplays_table_path = output_data + "songplays_table.parquet" + "_" \ + run_start_time print("Writing songplays_table parquet files to {}..."\ .format(songplays_table_path)) time_table.write.mode("overwrite").partitionBy("year", "month")\ .parquet(songplays_table_path) print("...finished writing songplays_table...") return users_table, time_table, songplays_table
def process_log_data(spark, input_data, output_data): """ The function to process song data Parameters: spark : The Spark session that will be used to execute commands. input_data : The input data to be processed. output_data : The location where to store the parquet tables. """ # get filepath to log data file log_data = input_data # read log data file df_log = spark.read.json(input_data) # filter by actions for song plays df_log = df_log.filter(F.col("page") == "NextSong") # Extract columns for users table users_cols = [user_id, first_name, last_name, gender, level] # remove duplicate rows users_table_df = df_log.select(users_cols).dropDuplicates() # write users table to parquet files users_table_df.write.parquet(output_data + 'users_table', mode='Overwrite') # define functions for extracting time components from ts field get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)), T.TimestampType()) get_hour = F.udf(lambda x: x.hour, T.IntegerType()) get_day = F.udf(lambda x: x.day, T.IntegerType()) get_week = F.udf(lambda x: x.isocalendar()[1], T.IntegerType()) get_month = F.udf(lambda x: x.month, T.IntegerType()) get_year = F.udf(lambda x: x.year, T.IntegerType()) get_weekday = F.udf(lambda x: x.weekday(), T.IntegerType()) # create timestamp column from original timestamp column df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts)) df_log = df_log.withColumn("hour", get_hour(df_log.timestamp)) df_log = df_log.withColumn("day", get_day(df_log.timestamp)) df_log = df_log.withColumn("week", get_week(df_log.timestamp)) df_log = df_log.withColumn("month", get_month(df_log.timestamp)) df_log = df_log.withColumn("year", get_year(df_log.timestamp)) df_log = df_log.withColumn("weekday", get_weekday(df_log.timestamp)) # extract columns to create time table time_cols = [start_time, hour, day, week, month, year, weekday] time_table_df = df_log.select(time_cols) # write time table to parquet files partitioned by year and month time_table_df.write.parquet(output_data + 'time_table', partitionBy=['year', 'month'], mode='Overwrite') # read in song data to use for songplays table # read the partitioned data df_artists_read = spark.read.option( "mergeSchema", "true").parquet(output_data + "artists_table") df_songs_read = spark.read.option( "mergeSchema", "true").parquet(output_data + "songs_table") # extract columns from joined song and log datasets to create songplays table songplay_cols = [ start_time, user_id, song_id, artist_id, session_id, locationSP, user_agent, level, month, year ] # join artists and songs so that we can join this table in the next step df_joined_songs_artists = df_songs_read.join( df_artists_read, 'artist_id').select("artist_id", "song_id", "title", "artist_name") # join df_log with the earlier joined artist and songs table songplay_table_df = df_log.join( df_joined_songs_artists, df_log.artist == df_joined_songs_artists.artist_name).select(songplay_cols) # create songplay_id songplay_table_df = songplay_table_df.withColumn( "songplay_id", F.monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplay_table_df.write.parquet(output_data + 'songplays_table', partitionBy=['year', 'month'], mode='Overwrite')
def main(base_path): spark = SparkSession.builder.config("spark.default.parallelism", 1).appName(APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Messages look like: # # { # "Carrier": "DL", # "DayOfMonth": 25, # "DayOfWeek": 4, # "DayOfYear": 359, # "DepDelay": 10.0, # "Dest": "LAX", # "Distance": 2475.0, # "FlightDate": "2015-12-25", # "FlightNum": null, # "Origin": "JFK", # "Timestamp": "2019-10-31T00:19:47.633280", # "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385" # } # # Process Prediction Requests from Kafka # message_df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", BROKERS) \ .option("subscribe", PREDICTION_TOPIC) \ .load() # Create a DataFrame out of the one-hot encoded RDD schema = T.StructType([ T.StructField("Carrier", T.StringType()), T.StructField("DayOfMonth", T.IntegerType()), T.StructField("DayOfWeek", T.IntegerType()), T.StructField("DayOfYear", T.IntegerType()), T.StructField("DepDelay", T.FloatType()), T.StructField("Dest", T.StringType()), T.StructField("Distance", T.FloatType()), T.StructField("FlightDate", T.StringType()), T.StructField("FlightNum", T.StringType()), T.StructField("Origin", T.StringType()), T.StructField("Timestamp", T.TimestampType()), T.StructField("UUID", T.StringType()), ]) prediction_requests_df = message_df.select( F.from_json(F.col("value").cast("string"), schema).alias("data")).select("data.*") # # Add a Route variable to replace FlightNum # prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', F.concat(prediction_requests_df.Origin, F.lit('-'), prediction_requests_df.Dest)) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Drop the individual index columns index_columns = [ "Carrier_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Make the 
prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Store the results to MongoDB class MongoWriter: def open(self, partition_id, epoch_id): print(f"Opened partition id: {partition_id}, epoch: {epoch_id}") self.mongo_client = pymongo.MongoClient() print(f"Opened MongoClient: {self.mongo_client}") return True def process(self, row): print(f"Processing row: {row}") as_dict = row.asDict() print(f"Inserting row.asDict(): {as_dict}") id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one( as_dict) print(f"Inserted row, got ID: {id.inserted_id}") self.mongo_client.close() return True def close(self, error): print("Closed with error: %s" % str(error)) return True query = final_predictions.writeStream.foreach(MongoWriter()).start() query.awaitTermination()
import re
from collections import namedtuple

from pyspark.sql import types as st

listOfCompanies = [
    'pixel', 'iphone', 'samsung', 'huawei', 'xiaomi', 'vivo', 'oppo',
    'motorola', 'realme', 'sony', 'oneplus'
]

fields = ("videoId", "channelId", "date", "mobileCompany", "views",
          "comments", "likes", "dislikes")
video = namedtuple("video", fields)

channels = spark.read.csv("users.csv", header=True, inferSchema=True).drop('_c0')

video_schema = st.StructType([
    st.StructField("videoId", st.StringType(), True),
    st.StructField("channelId", st.StringType(), True),
    st.StructField("creationDate", st.TimestampType(), True),
    st.StructField("mobileCompany", st.StringType(), True),
    st.StructField("views", st.IntegerType(), True),
    st.StructField("comments", st.IntegerType(), True),
    st.StructField("likes", st.IntegerType(), True),
    st.StructField("dislikes", st.IntegerType(), True)
])


def getCompany(t):
    # normalise the title and keep only words that name a phone company
    t = t.lower()
    t = re.sub('[^a-z ]+', '', t)
    t = t.split()
    t = list(filter(lambda word: word in listOfCompanies, t))
    if len(t) > 0:
        return t[0]
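Illustrative only: one way getCompany could be applied as a UDF. `raw_videos` and its `title` column are hypothetical names, not part of the snippet.

import pyspark.sql.functions as sf

get_company_udf = sf.udf(getCompany, st.StringType())

# `raw_videos`/`title` are placeholder names for the scraped video data.
videos = raw_videos.withColumn("mobileCompany", get_company_udf(sf.col("title")))
videos = videos.filter(sf.col("mobileCompany").isNotNull())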
def process_log_data(spark, input_data, output_data): """Process log data and creates users table, time table and songplays table""" # get filepath to log data file log_data = input_data + "log_data" # read log data file df = spark.read.json(log_data) # filter by actions for song plays # note: song plays needs all 'NextSong' results df = df.filter(df.page == 'NextSong') # extract columns for users table users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']] # write users table to parquet files users_table.dropDuplicates().write.parquet(output_data + "user.parquet") # create timestamp column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp((x / 1000.0)), T.TimestampType()) df = df.withColumn("start_time", get_datetime(df.ts)) # create table to extract columns df.createOrReplaceTempView("log_staging_table") # create columns for time table time_table = spark.sql(''' SELECT start_time, EXTRACT(hour from start_time) as hour, EXTRACT(day from start_time) as day, EXTRACT(week from start_time) as week, EXTRACT(month from start_time) as month, EXTRACT(year from start_time) as year, DAYOFWEEK(start_time) as weekday from log_staging_table ''').collect() # write time table to parquet files partitioned by year and month time_table_dataframe = spark.createDataFrame(time_table) time_table_dataframe.dropDuplicates().write.partitionBy( "year", "month").parquet(output_data + "time.parquet") # read in song data to use for songplays table song_data = input_data + "song_data" song_df = spark.read.json(song_data + "/*/*/*") song_df.createOrReplaceTempView("songs_staging_table") # extract columns from joined song and log datasets to create songplays table temp_table = spark.sql(''' SELECT a.start_time, a.userId, a.level, b.song_id, b.artist_id, a.sessionId, a.location, a.userAgent, EXTRACT(month from a.start_time) as month, EXTRACT(year from a.start_time) as year from log_staging_table a inner join songs_staging_table b on a.song = b.title ''').collect() songplays_table = spark.createDataFrame(temp_table) # write songplays table to parquet files partitioned by year and month songplays_table.dropDuplicates().write.partitionBy( "year", "month").parquet(output_data + "songplays.parquet")
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = '{}log_data/*.json'.format(input_data) # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(col("page") == "NextSong") # extract columns for users table users_table = df \ .selectExpr( \ "userId as user_id", "firstName as first_name", \ "lastName as last_name", "gender", "level") \ .dropDuplicates() # write users table to parquet files users_table \ .write \ .parquet(os.path.join(output_data, 'users'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf( \ lambda x: datetime.fromtimestamp( (x/1000.0) ), T.TimestampType()) df = df.withColumn("timestamp", get_timestamp(df.ts)) # create datetime columns from original timestamp column df = df \ .withColumn("hour", hour("timestamp")) \ .withColumn("day", dayofmonth("timestamp")) \ .withColumn("week", weekofyear("timestamp")) \ .withColumn("month", month("timestamp")) \ .withColumn("year", year("timestamp")) \ .withColumn("weekday", date_format("timestamp", 'EEEE')) # extract columns to create time table time_table = df \ .selectExpr( \ "ts as start_time", "hour", "day", \ "week", "month", "year", "weekday") \ .dropDuplicates() # write time table to parquet files partitioned by year and month time_table \ .write \ .partitionBy("year", "month") \ .parquet(os.path.join(output_data, 'time'), 'overwrite') # read in song data to use for songplays table song_df = spark \ .read \ .parquet('{}songs/*/*/*.parquet'.format(output_data)) # extract columns from joined song and log datasets to create songplays table songplays_table = df \ .join(song_df, df.song == song_df.title) \ .selectExpr("df.ts AS start_time", "df.user_id", "df.level", \ "song_df.song_id", "song_df.artist_id", "df.session_id", \ "df.location", "df.user_agent", "df.month", "df.year") \ .dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table \ .write \ .partitionBy("year", "month") \ .parquet(os.path.join(output_data, 'songplays'), 'overwrite')
def as_spark_type( tpe: Union[str, type, Dtype], *, raise_error: bool = True, prefer_timestamp_ntz: bool = False ) -> types.DataType: """ Given a Python type, returns the equivalent spark type. Accepts: - the built-in types in Python - the built-in types in numpy - list of pairs of (field_name, type) - dictionaries of field_name -> type - Python3's typing system """ if isinstance(tpe, np.dtype) and tpe == np.dtype("object"): pass # ArrayType elif tpe in (np.ndarray,): return types.ArrayType(types.StringType()) elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list): # type: ignore element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error) # type: ignore if element_type is None: return None return types.ArrayType(element_type) # BinaryType elif tpe in (bytes, np.character, np.bytes_, np.string_): return types.BinaryType() # BooleanType elif tpe in (bool, np.bool_, "bool", "?"): return types.BooleanType() # DateType elif tpe in (datetime.date,): return types.DateType() # NumericType elif tpe in (np.int8, np.byte, "int8", "byte", "b"): return types.ByteType() elif tpe in (decimal.Decimal,): # TODO: considering about the precision & scale for decimal type. return types.DecimalType(38, 18) elif tpe in (float, np.float_, np.float64, "float", "float64", "double"): return types.DoubleType() elif tpe in (np.float32, "float32", "f"): return types.FloatType() elif tpe in (np.int32, "int32", "i"): return types.IntegerType() elif tpe in (int, np.int64, "int", "int64", "long"): return types.LongType() elif tpe in (np.int16, "int16", "short"): return types.ShortType() # StringType elif tpe in (str, np.unicode_, "str", "U"): return types.StringType() # TimestampType or TimestampNTZType if timezone is not specified. elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"): return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType() # categorical types elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and type == "category"): return types.LongType() # extension types elif extension_dtypes_available: # IntegralType if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"): return types.ByteType() elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"): return types.ShortType() elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"): return types.IntegerType() elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"): return types.LongType() if extension_object_dtypes_available: # BooleanType if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"): return types.BooleanType() # StringType elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"): return types.StringType() if extension_float_dtypes_available: # FractionalType if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"): return types.FloatType() elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"): return types.DoubleType() if raise_error: raise TypeError("Type %s was not understood." % tpe) else: return None
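A few illustrative calls, following the branches above; these examples are not part of the original module.

as_spark_type(int)                    # LongType()
as_spark_type(np.int32)               # IntegerType()
as_spark_type("float64")              # DoubleType()
as_spark_type(datetime.datetime)      # TimestampType()
as_spark_type(datetime.datetime, prefer_timestamp_ntz=True)  # TimestampNTZType()
as_spark_type(dict, raise_error=False)  # returns None instead of raising TypeError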
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = 's3a://udacity-dend/log_data/*/*/*/*.json' # read log data file df_log = spark.read.json(log_data) # filter by actions for song plays df_log = df_log.filter(df_log['page'] == 'NextSong') # extract columns for users table users_table = df_log.select("userId", "firstName", "lastName", "gender", "level") # write users table to parquet files users_table.write.mode('overwrite').parquet(output_data + "users") # create timestamp column from original timestamp column get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)), T.TimestampType()) df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts)) # create datetime column from original timestamp column df_log = df_log.withColumn( 'date_time', from_unixtime(df_log.ts / 1000).cast(dataType=T.TimestampType())) # extract columns to create time table #start_time, hour, day, week, month, year, weekday time_table = df_log.select( col("date_time").alias("start_time"), year(col('date_time')).alias('year'), month(col('date_time')).alias('month'), dayofmonth(col('date_time')).alias('day'), hour(col('date_time')).alias('hour'), weekofyear(col('date_time')).alias('week')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + "time") # extract columns from joined song and log datasets to create songplays table #must change dataframe to table for SQL functions df_log.createOrReplaceTempView("log_df_table") df_songs.createOrReplaceTempView("song_df_table") time_table.createOrReplaceTempView("time_table_table") songplays_table = spark.sql(""" SELECT DISTINCT log_df_table.userId, log_df_table.level, log_df_table.location, log_df_table.userAgent, log_df_table.sessionId, log_df_table.date_time, song_df_table.artist_id, song_df_table.song_id, time_table_table.month, time_table_table.year FROM log_df_table JOIN song_df_table ON song_df_table.artist_name = log_df_table.artist JOIN time_table_table ON time_table_table.start_time = log_df_table.date_time """) #monotonically_increasing_id assigns unique id number to each row songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy("year", "month").parquet(output_data + "songplays")
import sys, re, math, datetime, json from pyspark import SparkConf, SparkContext assert sys.version_info >= (3, 5) # make sure we have Python 3.5+ from pyspark.sql import SQLContext, Row, SparkSession, functions as f, types as t nasa_schema = t.StructType([ t.StructField('hostname', t.StringType(), False), t.StructField('path', t.StringType(), False), t.StructField('size', t.FloatType(), False), t.StructField('timestamp', t.TimestampType(), False) ]) def make_rdd(line): return Row(hostname=line[1], timestamp=datetime.datetime.strptime(line[2], '%d/%b/%Y:%H:%M:%S'), path=line[3], size=float(line[4])) def main(inputs): text = sc.textFile(inputs) linere = re.compile( "^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$" ) nasa_rdd = text.map(lambda x: linere.split(x)).filter( lambda line: len(line) > 4).map(make_rdd) nasa_df = sqlContext.createDataFrame(nasa_rdd, nasa_schema) agg_df = nasa_df.groupBy(nasa_df.hostname).agg(f.count('*').alias('x') \ ,f.sum(nasa_df.size).alias('y')) \
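The `agg_df` statement above is truncated by the trailing continuation. Assuming it ends with those two aggregates, one possible continuation inside main (my sketch, not the original) computes the correlation between request count (x) and total bytes (y) across hosts:

    # single global aggregate over the per-host counts/sums (sketch, assumed intent)
    six = agg_df.agg(
        f.count('*').alias('n'),
        f.sum('x').alias('sx'), f.sum(agg_df.x * agg_df.x).alias('sx2'),
        f.sum('y').alias('sy'), f.sum(agg_df.y * agg_df.y).alias('sy2'),
        f.sum(agg_df.x * agg_df.y).alias('sxy'),
    ).first()
    n, sx, sx2, sy, sy2, sxy = six
    r = (n * sxy - sx * sy) / (
        math.sqrt(n * sx2 - sx * sx) * math.sqrt(n * sy2 - sy * sy))
    print('r =', r, 'r^2 =', r * r)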
from datetime import datetime assert sys.version_info >= (3, 5) # make sure we have Python 3.5+ from pyspark.sql import SparkSession, functions, types cluster_seeds = ['199.60.17.32', '199.60.17.65'] spark = SparkSession.builder.appName('Spark Cassandra example') \ .config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate() spark.sparkContext.setLogLevel('WARN') sc = spark.sparkContext line_re = re.compile( r'^(\S+) - - \[(\S+ [+-]\d+)\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$') schema = types.StructType([ #types.StructField('id', types.StringType()), types.StructField('host', types.StringType()), types.StructField('datetime', types.TimestampType()), types.StructField('path', types.StringType()), types.StructField('bytes', types.IntegerType()) ]) def read_line(line): m = line_re.match(line) if m is None: return None return (m.group(1), datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S %z'), m.group(3), int(m.group(4))) def main(input_dir, keyspace, table): text = sc.textFile(input_dir).repartition(10)
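A sketch of a plausible continuation of main (assumed, not from the original): parse each line with read_line, drop non-matching lines, attach a synthetic key, and save into the given Cassandra table. The uuid() key and append mode are assumptions.

    rows = text.map(read_line).filter(lambda r: r is not None)
    logs = spark.createDataFrame(rows, schema=schema)
    # the commented-out 'id' field in the schema suggests a synthetic primary key (assumption)
    logs = logs.withColumn('id', functions.expr('uuid()'))
    logs.write.format("org.apache.spark.sql.cassandra") \
        .options(table=table, keyspace=keyspace) \
        .mode('append').save()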
def process_log_data(spark, input_data, output_data): """ This function reads song_data and log_data from S3 (public folder), makes user, artists and songplay tables and uploads them back on S3 Parameters: spark : SparkSession input_data : public S3 path where input data is scored output_data : Our S3 path where output data is scored Returns: user, artists, songplay tables are saved as parquet files on a personal folder on S3 """ # get filepath to log data file log_data = f'{input_data}/log_data/*.json' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table user_table = df.select( ["userId", "firstname", "lastname", "gender", "level"]).where(df["userId"].isNotNull()) # write users table to parquet files user_data_out = f'{output_data}/user_table/user_table.parquet' user_table.write.mode('overwrite').parquet(user_data_out) # create timestamp column from original timestamp column get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)), T.TimestampType()) df = df.withColumn("timestamp", get_timestamp(df.ts)) # extract columns to create time table time_table = df.select(['timestamp']).dropDuplicates() time_table = time_table.withColumn("hour", hour(time_table["timestamp"])) time_table = time_table.withColumn("day", dayofyear(time_table["timestamp"])) time_table = time_table.withColumn("week", weekofyear(time_table["timestamp"])) time_table = time_table.withColumn("month", month(time_table["timestamp"])) time_table = time_table.withColumn("year", year(time_table["timestamp"])) time_table = time_table.withColumn("weekday", dayofweek(time_table["timestamp"])) # write time table to parquet files partitioned by year and month time_data_out = f'{output_data}/time_table/time_table.parquet' time_table.write.mode('overwrite').partitionBy( 'year', 'month').parquet(time_data_out) # read in song data to use for songplays table song_data = f'{input_data}/song_data/*/*/*/*.json' sdf = spark.read.json(song_data) sdf.createOrReplaceTempView("song_df_table") # Adding month and year column to log data read and preparing log data table df = df.withColumn("month", month(df["timestamp"])) df = df.withColumn("year", year(df["timestamp"])) df.createOrReplaceTempView("log_df_table") # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql(""" SELECT ldf.timestamp as start_time, ldf.userid as user_id, ldf.level, sdf.song_id, sdf.artist_id, ldf.sessionid as session_id, ldf.location, ldf.useragent as user_agent, ldf.month, ldf.year FROM log_df_table ldf JOIN song_df_table sdf ON (ldf.song = sdf.title) AND (ldf.artist = sdf.artist_name) AND (ldf.length = sdf.duration) WHERE ldf.page = 'NextSong' and ldf.userid is not null """) # adding the songplay_id column window = Window.orderBy(F.col('start_time')) songplays_table = songplays_table.withColumn('songplay_id', F.row_number().over(window)) songplays_table.select('songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'month', 'year').show() # write songplays table to parquet files partitioned by year and month songplays_data_out = f'{output_data}/songplays_table/songplays_table.parquet' songplays_table.write.mode('overwrite').partitionBy( 'year', 'month').parquet(songplays_data_out)
#types.BooleanType : bool, types.LongType: int, types.IntegerType: int, types.DoubleType: float, types.DecimalType: float, types.StringType: str, types.TimestampType: datetime } # map data type to pyspark sql type _data_type_to_pyspark_type_table = { int: types.IntegerType(), long: types.LongType(), float: types.DoubleType(), str: types.StringType(), datetime: types.TimestampType() } # build reverse map string -> type _primitive_str_to_type_table = dict([ (s, t) for t, s in _primitive_type_to_str_table.iteritems() ]) _primitive_alias_type_to_type_table = { float: float64, int: int32, long: int64, str: unicode, list: vector, }
import pyspark.sql.functions as F import pyspark.sql.types as T from pyspark.sql.functions import struct, window, col, lit from utils.spark import kafka_source import config a3_struct_common = T.StructType([ T.StructField("timetamp_start", T.TimestampType()), T.StructField("timetamp_end", T.TimestampType()), T.StructField("country_name", T.StringType()), T.StructField("topic_name_exp", T.StringType()), T.StructField("topic_sum", T.IntegerType()), ]) def task_a_3_step_0(json_parsed_df): result = json_parsed_df.withColumn('topic_name_exp', F.explode('topic_name')) \ .withWatermark("timestamp", "1 minute").groupBy( F.window("timestamp", "1 hour", "1 hour"), 'country_name', 'topic_name_exp' ).agg( F.count('topic_name_exp').alias('topic_count') ).select( F.struct( col('window.end').alias("datetime_end"), col('country_name'), col('topic_name_exp'),
def _apply_dateutil_parse(column):
    assert len(column.columns) == 1, "Expected DataFrame with 1 column"
    col_name = column.columns[0]
    _udf = udf(parse, sparktypes.TimestampType())
    return column.withColumn(col_name, _udf(col_name))
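Illustrative usage (the DataFrame and session here are hypothetical): the helper expects a single-column DataFrame of strings and returns it with that column parsed to timestamps via dateutil.

raw = spark.createDataFrame(
    [("2018-12-31T23:57:00",), ("2019-01-01 00:03:12",)], ["call_time"])
parsed = _apply_dateutil_parse(raw)
parsed.printSchema()   # call_time is now TimestampType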
# Read in raw data impressionFields = [ T.StructField('advertiserID', T.IntegerType(), False), T.StructField('domain', T.StringType(), False), T.StructField('viewable', T.BooleanType(), False), T.StructField('city', T.StringType(), False), T.StructField('mobileDevice', T.StringType(), False), T.StructField('country', T.StringType(), False), T.StructField('sellerPrice', T.IntegerType(), False), T.StructField('userID', T.IntegerType(), False), T.StructField('impressionID', T.IntegerType(), False), T.StructField('postalCode', T.StringType(), False), T.StructField('carrier', T.StringType(), False), T.StructField('eventType', T.StringType(), False), T.StructField('lineItemID', T.IntegerType(), False), T.StructField('time', T.TimestampType(), False), T.StructField('duration', T.IntegerType(), False), T.StructField('browser', T.StringType(), False), T.StructField('os', T.StringType(), False), T.StructField('audienceSegmentID', T.IntegerType(), False) ] impressionSchema = T.StructType(impressionFields) currentDate = dt.now().strftime('%Y-%m-%d') impressionsFiles = os.path.join('gs://sgupta_doubleclick', 'staging', '*', currentDate, 'impressions.csv') rawImpressions = spark.read.load(impressionsFiles, format="csv", header=True, schema=impressionSchema)\ .withColumn('filename', F.input_file_name())\ .withColumn('clientID',F.regexp_extract('filename','.*staging/([0-9]*)/.*',1).cast('int'))\ .withColumn('date',F.regexp_extract('filename','.*staging/[0-9]*/([0123456789-]*)/.*',1).cast('date'))\
from pyspark.sql import types

# base type
DType = types.DataType

# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()

# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()

# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
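An illustrative use of these aliases (not part of the module): since they are ordinary pyspark.sql.types instances, they can be dropped into schemas or compared against the groups.

event_schema = types.StructType([
    types.StructField("id", Long, False),
    types.StructField("name", String, True),
    types.StructField("created_at", Datetime, True),
    types.StructField("score", Double, True),
])

def is_numeric(dtype):
    # DataType instances compare by value, so a membership test works
    return dtype in Numerics

is_numeric(Integer)   # True
is_numeric(String)    # False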
from pyspark.ml.feature import VectorAssembler from pyspark.ml.classification import LinearSVC from pyspark.ml.clustering import KMeans, KMeansModel from pyspark.ml import Pipeline from pyspark.ml.feature import MinMaxScaler from pyspark.ml.linalg import Vectors spark = SparkSession.builder.appName('commuter').getOrCreate() assert spark.version >= '2.4' # make sure we have Spark 2.4+ spark.sparkContext.setLogLevel('WARN') #sc = spark.sparkContext amenity_schema = types.StructType([ types.StructField('lat', types.DoubleType(), nullable=False), types.StructField('lon', types.DoubleType(), nullable=False), types.StructField('timestamp', types.TimestampType(), nullable=False), types.StructField('amenity', types.StringType(), nullable=False), types.StructField('name', types.StringType(), nullable=True), types.StructField('tags', types.MapType(types.StringType(), types.StringType()), nullable=False), ]) def main(inputs): poi = spark.read.json(inputs, schema=amenity_schema) # poi.show() stage1 = VectorAssembler(inputCols=['lon', 'lat'], outputCol='features') stage2 = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") stage3 = KMeans().setK(7).setFeaturesCol( "scaledFeatures").setPredictionCol('prediction')
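A minimal sketch of how the three stages above are presumably chained inside main (assumed continuation, not the original code):

    pipeline = Pipeline(stages=[stage1, stage2, stage3])
    model = pipeline.fit(poi)
    clustered = model.transform(poi)
    clustered.select('lat', 'lon', 'amenity', 'prediction').show(10)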
return _Unknown(tpe) else: return _Scalar(inner) # First element of the list is the python base type _base = { types.StringType(): [str, "str", "string"], types.BinaryType(): [bytes], types.ByteType(): [np.int8, "int8", "byte"], types.ShortType(): [np.int16, "int16", "short"], types.IntegerType(): [int, "int", np.int, np.int32], types.LongType(): [np.int64, "int64", "long", "bigint"], types.FloatType(): [float, "float", np.float], types.DoubleType(): [np.float64, "float64", "double"], types.TimestampType(): [datetime.datetime, np.datetime64], types.DateType(): [datetime.date], types.BooleanType(): [bool, "boolean", "bool", np.bool], types.ArrayType(types.StringType()): [], } def _build_type_dict(): return dict([(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l] + [(spark_type, spark_type) for (spark_type, _) in _base.items()]) def _build_py_type_dict(): return dict([(spark_type, l[0]) for (spark_type, l) in _base.items() if len(l) > 0])
def process_log_data(spark, input_data, output_data): """ It creates time, user dimensions and songplays fact by processing source log dataset and writes these result data as parquet files in S3. Parameters: spark: spark session input_data: root path of source data output_data: root path of target """ # turn Python function into Pyspark function timestamp_udf = F.udf(get_timestamp, T.TimestampType()) # get filepath to log data file log_data_path =input_data + "log_data/*/*/*.json" print(log_data_path) # read log data file log_data_df = spark.read.json(log_data_path) # filter by actions for song plays log_data_df = log_data_df.filter((log_data_df.page== 'NextSong')) #convert unixtimestamp "ts" column to timestamp log_data_df= log_data_df.withColumn('timestamp', timestamp_udf("ts")) # extract columns for users table #log data has duplicated UserIds.But we want only one row for each user. ## So we get the most recent record of the user by using a row_number. user_columns= ['userId', 'firstName', 'lastName', 'gender', 'level', 'timestamp'] ## remove duplicates using row_number. user_df_rn = log_data_df.select(*user_columns)\ .withColumn('row_num', F.row_number().over(Window.partitionBy("userId").orderBy(F.desc("timestamp")))) users_table_df = user_df_rn.filter((user_df_rn.row_num)==1).select(*user_columns[0:-1]) # write users table to parquet files users_table_df.coalesce(5).write.parquet(output_data+'users_table.parquet', mode='overwrite') # extract columns to create time table time_table_df= log_data_df.select( log_data_df.timestamp.alias('start_time'), F.hour(log_data_df.timestamp).alias('hour'), F.dayofmonth(log_data_df.timestamp).alias('day'), F.weekofyear(log_data_df.timestamp).alias('week'), F.month(log_data_df.timestamp).alias('month') , F.year(log_data_df.timestamp).alias('year'), F.dayofweek(log_data_df.timestamp).alias('weekday')).dropDuplicates() # write time table to parquet files partitioned by year and month time_table_df.write.partitionBy("year","month").parquet(output_data+'time_table.parquet', mode='overwrite') #get full path song_data=input_data + 'song_data/*/*/*/*.json' # read song data file song_data_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table songplays_table_df= log_data_df.join\ (song_data_df, (log_data_df.artist == song_data_df.artist_name) & (log_data_df.song== song_data_df.title) & ( log_data_df.length== song_data_df.duration), how='inner')\ .select(log_data_df.timestamp , log_data_df.userId, log_data_df.level, song_data_df.song_id, song_data_df.artist_id, log_data_df.sessionId, log_data_df.location, log_data_df.userAgent )\ .withColumn('year',F.year(log_data_df.timestamp))\ .withColumn('month',F.month(log_data_df.timestamp)) # write songplays table to parquet files partitioned by year and month songplays_table_df.write.partitionBy("year", "month").parquet(output_data+'songplays_table.parquet', mode='overwrite') print(users_table_df.count()) print(time_table_df.count()) print(songplays_table_df.count())
import datetime

from pyspark.sql import functions as F
from pyspark.sql import types as T

complex_event_expression = (
    F.when(F.col("nested.input_key_1").isNotNull(),
           F.col("nested.input_key_1") / 1000)
    .otherwise(F.col("nested.input_key_2") / 1000)
    .cast(T.TimestampType())
    .cast(T.DateType())
)

# fmt: off
fixtures_for_spark_sql_object = [
    # input_value_1, input_value_2, mapper function, expected_value
    ("place_holder", "place_holder", F.current_date(), datetime.date.today()),
    ("place_holder", "place_holder", F.current_timestamp(), datetime.datetime.now()),
    ("some string", "place_holder", F.col("nested.input_key_1"), "some string"),
    ("some string to count", "place_holder", F.length("nested.input_key_1"), 20),
    ("some string", None, F.coalesce("nested.input_key_1", "nested.input_key_2"), "some string"),
    (None, "some other string", F.coalesce("nested.input_key_1", "nested.input_key_2"), "some other string"),
    (1597069446, "placeholder",
     (F.col("nested.input_key_1").cast(T.TimestampType()).cast(T.DateType())),
     datetime.date(2020, 8, 10)),
    (1597069446000, None, complex_event_expression, datetime.date(2020, 8, 10)),
    (None, 1597069446000, complex_event_expression, datetime.date(2020, 8,
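The fixture list above is truncated. Illustrative only: how complex_event_expression behaves on a tiny DataFrame with the assumed nested layout (epoch milliseconds in the input keys); the `spark` session is assumed and the session timezone determines the exact date.

nested_schema = T.StructType([
    T.StructField("nested", T.StructType([
        T.StructField("input_key_1", T.LongType()),
        T.StructField("input_key_2", T.LongType()),
    ]))
])
demo = spark.createDataFrame(
    [((1597069446000, None),), ((None, 1597069446000),)], nested_schema)
demo.select(complex_event_expression.alias("event_date")).show()
# both rows resolve to 2020-08-10 in UTC, matching the fixtures above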
#READ STRATEGIES strategies = sqlContext.read.json(sys.argv[2]) #filter strategy strategies = strategies.filter( col("rationality") == float(sys.argv[3])).filter( col("num_of_hp") == float(sys.argv[4])) #add unique name strategies_full = strategies.withColumn( "strategyID", concat(col("date"), lit("_"), col("num_of_hp"), lit("_"), col("rationality"))) #cast date to timestamp strategies = strategies_full.withColumn( 'date', unix_timestamp('date', 'yyyy-MM-dd').cast(T.TimestampType())) #unfold the structs strategies = strategies_full.select("date", "strategyID", explode("stg")).select( "date", "strategyID", "col.port", "col.prob") #add new column with date when the strategy should be applied strategies = strategies.withColumn('application_date', F.date_add(strategies['date'], 1)).select( "application_date", "strategyID", "port", "prob") strategies = strategies.withColumn( 'application_date', unix_timestamp('application_date', 'yyyy-MM-dd hh:mm:ss').cast(T.TimestampType()))
def main(): # main logic starts here #Read bike_trips_data data = spark.read.format("org.apache.spark.sql.cassandra").options( table='transaction_data2', keyspace="bike_share_analytics").load() data = data.dropna() #read bike station data df_station = spark.read.format("org.apache.spark.sql.cassandra").options( table='station_data', keyspace="bike_share_analytics").load() df_station = df_station.select("id", "weather_station_id") #join data cond = [data['start station id'] == df_station.id] df_combined = data.join(df_station, cond, how='inner') df_combined = df_combined.select( 'tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender', 'weather_station_id') #Create new columns df_combined = df_combined.withColumn( 'startyear', functions.year(df_combined['starttime'])) df_combined = df_combined.withColumn( 'dayofyear', functions.dayofyear(df_combined['starttime'])) ##Adding weather data to trips data df_weather = spark.read.format("org.apache.spark.sql.cassandra").options( table='weather_data1', keyspace="bike_share_analytics").load() df_weather = df_weather.withColumn( "timestamp", functions.unix_timestamp('time', "yyyy-MM-dd HH:mm:ss").cast( types.TimestampType())) df_weather = df_weather.withColumn('hour_weather', functions.hour(df_weather['timestamp'])) df_weather = df_weather.withColumn( 'month_weather', functions.month(df_weather['timestamp'])) df_weather = df_weather.withColumn('year_weather', functions.year(df_weather['timestamp'])) df_weather = df_weather.withColumn( 'dayofyear_weather', functions.dayofyear(df_weather['timestamp'])) cond1 = [ df_combined.weather_station_id == df_weather.id, df_combined.startyear == df_weather.year_weather, df_combined.dayofyear == df_weather.dayofyear_weather, df_combined.starthour == df_weather.hour_weather ] df_combined_new = df_combined.join(df_weather, cond1, how="inner") df_combined_new = df_combined_new.select( 'tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender', 'age', 'dayofyear', 'starthour', 'startmonth', 'startyear', 'stophour', 'stopmonth', 'weekday', 'weekend', 'temperature', 'precipitation', 'humidity', 'dewpoint', 'windspeed') #pushing data to cassandra df_combined_new.write.format("org.apache.spark.sql.cassandra") \ .options(table='transaction_data5', keyspace='bike_share_analytics').save()