#----------------------------------------------------------------------------
## Main functionality

if __name__ == "__main__":
    main_config_file_filter = None
    errorCount = 0
    workflowStartTime = datetime.datetime.now()

    if len(sys.argv) > 1:
        main_config_file = sys.argv[1]
    if len(sys.argv) > 2:
        main_config_file_filter = sys.argv[2]

    spark.udf.register('udfConvertInt', convertInt, IntegerType())
    spark.udf.register('udfConvertDouble', convertDouble, DoubleType())
    spark.udf.register('udfConvertDatetime', convertDatetime, TimestampType())

    mainConfig = spark.read.load(main_config_file, format="csv", delimiter="|", header=True)
    # Operation|LoadType|threads|Server|Database|t|WhereClause|DeltaColumn|UniqueIdentifiers|PartitionColumn|TargetLocationRaw|TargetLocationCooked|TargetLocationTableSchema|HiveDatabase|HiveTable|Comments

    if main_config_file_filter is not None:
        mainConfig = mainConfig.filter(main_config_file_filter)

    for row in mainConfig.collect():
        try:
            print("====================================================================================================")
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf
from pyspark.sql.types import (DateType, IntegerType, FloatType, StructField,
                               StructType, TimestampType)

spark = SparkSession.builder.appName("Read Transactions").getOrCreate()

csv_schema = StructType([
    StructField('customer_id', IntegerType()),
    StructField('amount', FloatType()),
    StructField('purchased_at', TimestampType()),
])

dataframe = spark.read.csv("transactions.csv", schema=csv_schema, header=True)
dataframe.show()

# Add a new column by formatting the original date
formatted_df = dataframe.withColumn(
    "date_string", date_format(col("purchased_at"), 'MM/dd/yyyy'))
formatted_df.show()

# Create a user defined function
string_to_date = \
    udf(lambda text_date: datetime.strptime(text_date, '%m/%d/%Y'), DateType())

typed_df = formatted_df.withColumn("date", string_to_date(formatted_df.date_string))
def process_log_data(spark, input_data, output_data):
    """
    Process log data from json files and create users, time, and songplays tables in parquet.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json("{}log_data/*/*/*.json".format(log_data))

    # filter by actions for song plays
    df = df.filter(df['page'] == "NextSong")

    # extract columns for users table
    users_table = df.withColumn("last_stamp", max_(col('ts')).over(Window.partitionBy("userId"))) \
        .filter(col('ts') == col('last_stamp')) \
        .select('userId', 'firstName', 'lastName', 'gender', 'level')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0), DateType())
    df = df.withColumn("datetime", get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.withColumn('hour', hour(df.timestamp)) \
        .withColumn('day', dayofmonth(df.timestamp)) \
        .withColumn('week', weekofyear(df.timestamp)) \
        .withColumn('month', month(df.timestamp)) \
        .withColumn('year', year(df.timestamp)) \
        .withColumn('weekday', date_format('timestamp', 'u')) \
        .select('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday').distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_tbl'), mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artists'))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, (df.song == song_df.title) & (df.length == song_df.duration), 'left_outer') \
        .join(artist_df, (song_df.artist_id == artist_df.artist_id) & (df.artist == artist_df.artist_name), 'left_outer') \
        .select(
            df.timestamp.alias("start_time"),
            df.userId.alias("user_id"),
            df.level,
            song_df.song_id,
            song_df.artist_id,
            df.sessionId.alias("session_id"),
            df.location,
            df.userAgent.alias("user_agent")
        ).withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.join(time_table, (songplays_table.start_time == time_table.timestamp)) \
        .select(songplays_table["*"], time_table.year, time_table.month) \
        .write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays'), mode='overwrite')
"struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } PROFILER_COLUMN_TYPES = { "categorical", "numeric", "date", "null", "array", "binary" } PYTHON_TO_PROFILER = { "string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric", "date": "date", "array": "array", "binaty": "binary", "null": "null" } SPARK_DTYPES_TO_PROFILER = {
def test_timestamp_microsecond(self):
    tst = TimestampType()
    self.assertEqual(tst.toInternal(datetime.datetime.max) % 1000000, 999999)
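# A minimal sketch (not part of the test suite above) of the round-trip that
# test relies on: TimestampType.toInternal encodes a datetime as integer
# microseconds since the epoch, and fromInternal decodes it back.
import datetime
from pyspark.sql.types import TimestampType

tst = TimestampType()
dt = datetime.datetime(2021, 6, 1, 12, 30, 45, 123456)
internal = tst.toInternal(dt)        # int: microseconds since the epoch
assert internal % 1000000 == 123456  # the microsecond component survives
assert tst.fromInternal(internal) == dt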
# Before Spark 1.4
train = sqlContext.load(source="com.databricks.spark.csv", path='PATH/train.csv',
                        header=True, inferSchema=True)
test = sqlContext.load(source="com.databricks.spark.csv", path='PATH/test-comb.csv',
                       header=True, inferSchema=True)

# Current Spark 2.1 and ...
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("yarn").getOrCreate()
df = spark.read.csv('hdfs://hadoop-master:9000/index/train.csv', mode="DROPMALFORMED")

# Defining schema with ArrayType
schema = StructType([StructField('array_column',
                                 ArrayType(StructType([StructField('element_of_array', StringType(), True)])),
                                 True)])

# From local: the third parameter, i.e. the boolean True / False, denotes whether the corresponding field is nullable
from pyspark.sql.types import StructType, StructField, LongType, StringType, TimestampType

schema = StructType([StructField('col0', LongType(), True),
                     StructField('col1', LongType(), True),
                     StructField('col2', StringType(), True),
                     StructField('col3', StringType(), True),
                     StructField('col4', TimestampType(), True),
                     StructField('col5', TimestampType(), True),
                     StructField('col6', StringType(), True)])
df = spark.read.csv('file:///index/data_extract_restart2_without_cert/data_refined.csv',
                    mode="DROPMALFORMED", schema=schema)

# Creating UDF
def replace_commas(sk):  # renamed from `dict`, which shadowed the builtin
    new_sk = sk.replace(',', '|')  # replacing comma by pipe in column col2; the result goes in 'new_column_name'
    return new_sk

udf_dict = udf(replace_commas, StringType())
df.withColumn('new_column_name', udf_dict("col2")).write.csv(path="/index/skill_clean_v3")  # col2 is the column to be changed

df.write.csv('/data/file_csv/', mode="overwrite")  # may also add: sep="\t"

# TIP: if a dataframe has a list / array in one of its columns (like 'student_name_list'),
# it can't be written to a CSV file directly; see the sketch below.
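# A minimal sketch of the tip above, with hypothetical data and column names:
# join the array column into one delimited string so the dataframe becomes
# CSV-writable.
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, ["alice", "bob"]), (2, ["carol"])],
    ["class_id", "student_name_list"],
)
# CSV has no array representation, so serialize the array first.
flat = df.withColumn("student_name_list", concat_ws("|", "student_name_list"))
flat.write.csv("/tmp/students_csv", mode="overwrite", header=True)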
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType
from pyspark.sql.functions import window
import time

spark = SparkSession \
    .builder \
    .appName("carStreaming") \
    .getOrCreate()

schema = StructType([
    StructField('type', StringType(), True),
    StructField('color', StringType(), True),
    StructField('timestamp', TimestampType(), True)])

# Create DataFrame representing the stream of CSV files arriving in the data directory
fileStreamDf = spark \
    .readStream \
    .option("header", "true") \
    .schema(schema) \
    .option("inferSchema", "true") \
    .csv("/home/hadoop/spark-streaming/data/")

aggDF = fileStreamDf.groupBy("type").count()

# windowedCounts = fileStreamDf \
#     .withWatermark("timestamp", "2 minutes") \
#     .groupBy(window(fileStreamDf.timestamp, "10 minutes", "5 minutes"), fileStreamDf.type).count()
def convert_extract_to_parquet(extract_loc, save_dir, spark=None):
    if not spark:
        spark = SparkSession \
            .builder \
            .appName("shared") \
            .getOrCreate()

    # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=read%20csv
    # ignore (escape) " if already within quotes to avoid splitting by , within the columnwise jsons
    # NullPointerException if you try to access something you promised would never be null
    # cannot read nested structs straight from CSV, sadly, so will parse columns individually
    metadata_struct = StructType([
        StructField('source', StringType(), False),
        StructField('session', StringType(), False),
        StructField(
            'viewport',
            StructType([
                StructField('width', StringType(), False),
                StructField('height', StringType(), False)
            ]), False),
        StructField('started_at', TimestampType(), False),
        StructField('user_agent', StringType(), False),
        StructField('utc_offset', StringType(), False),
        StructField('finished_at', TimestampType(), False),
        StructField('live_project', BooleanType(), False),
        StructField('interventions', StringType(), False),  # actually struct
        StructField('user_language', StringType(), False),
        StructField('source', StringType(), False),
        StructField('subject_dimensions', StringType(), False),  # actually struct
        StructField('subject_selection_state', StringType(), False),  # actually struct
        StructField('workflow_translation_id', StringType(), True),  # actually struct, sometimes null
    ])

    # TODO answer (at the very least) is very occasionally null, and this causes either EOF/Null pointer (if not nullable) or raise error like
    # ValueError: Answer None of type <class 'NoneType'> not found in schema for question T0
    # should filter out tasks with missing keys for these
    annotations_struct = ArrayType(
        StructType([
            StructField('task', StringType(), True),
            StructField('task_id', StringType(), True),
            StructField('task_label', StringType(), True),
            StructField('value', StringType(), True),
            StructField('multiple_choice', BooleanType(), True),
        ]))

    # subject_data_internal_struct = StructType(
    #     # StructField('!iauname', StringType(), True),
    #     # StructField('iauname', StringType(), True)
    # )
    # subject_data_struct = ArrayType(MapType(StringType(), subject_data_internal_struct))

    schema = StructType([
        StructField('classification_id', StringType(), False),
        StructField('user_name', StringType(), True),
        StructField('user_id', StringType(), True),
        StructField('user_ip', StringType(), True),
        StructField('workflow_id', StringType(), False),
        StructField('workflow_name', StringType(), False),
        StructField('workflow_version', FloatType(), False),
        StructField('created_at', StringType(), False),
        StructField('gold_standard', StringType(), False),
        StructField('expert', StringType(), False),
        StructField('metadata', StringType(), False),
        StructField('annotations', StringType(), False),
        StructField('subject_data', StringType(), False),
        StructField('subject_ids', StringType(), False)
    ])

    # schema = StructType([
    #     StructField('name', StructType([
    #         StructField('firstname', StringType(), True),
    #         StructField('middlename', StringType(), True),
    #         StructField('lastname', StringType(), True)
    #     ])),
    #     StructField('id', StringType(), True),
    #     StructField('gender', StringType(), True),
    #     StructField('salary', IntegerType(), True)
    # ])

    ds = spark.read.csv(extract_loc, header=True, quote='"', escape='"',
                        schema=schema, mode='FAILFAST')

    # for debugging
    # ds = ds.sample(withReplacement=False, fraction=.1, seed=42)
    # print(ds.head())

    # need to unpack metadata and subject data
    # print(ds.head()['metadata'])
    # print(ds.head()['annotations'])

    metadata_str_to_struct_udf = udf(metadata_str_to_struct, returnType=metadata_struct)
    annotations_str_to_struct_udf = udf(annotation_to_struct, returnType=annotations_struct)
    subject_data_str_to_iauname_udf = udf(subject_data_str_to_iauname, returnType=StringType())
    get_person_id_udf = udf(get_person_id, returnType=StringType())

    ds = ds.withColumn('metadata', metadata_str_to_struct_udf(ds['metadata']))
    ds = ds.withColumn('annotations', annotations_str_to_struct_udf(ds['annotations']))
    ds = ds.withColumn('iauname', subject_data_str_to_iauname_udf(ds['subject_data']))
    ds = ds.withColumn('person_id', get_person_id_udf(ds['user_id'], ds['user_ip']))
    ds = ds.withColumnRenamed('subject_ids', 'subject_id')
    # TODO hardcoded for now as not in export. lit to make it a column, as Spark requires.
    ds = ds.withColumn('project_id', lit('5733'))

    flattened = flatten.api_df_to_responses(ds)
    flattened.write.parquet(save_dir, mode='overwrite')
            & (song_df.title == logs_df.song)
        ).select(logs_df.ts,
                 logs_df.userId.alias('user_id'),
                 logs_df.level,
                 song_df.song_id,
                 song_df.artist_id,
                 logs_df.sessionId.alias('session_id'),
                 logs_df.location,
                 logs_df.userAgent.alias('user_agent'))\
        .withColumn('songplay_id', F.monotonically_increasing_id())\
        .withColumn('start_time', get_datetime_from(logs_df.ts))\
        .withColumn('year', F.year('start_time'))\
        .withColumn('month', F.month('start_time'))

    # partition songplays_table by year and month
    songplays_table = songplays_table.repartition('year', 'month')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode('overwrite').save(
        f"{S3_OUTPUT_PATH}/songplays_table.parquet")


@udf(TimestampType())
def get_datetime_from(long_value):
    """ Converts timestamp of type Long to datetime """
    return datetime.fromtimestamp(long_value / 1000.0)


def main():
    """ Orchestrates the ETL """
    spark = create_spark_session()
    input_data = "s3a://udacity-dend"
from pyspark.sql.types import StructType, StructField, TimestampType, DecimalType, StringType, DoubleType

from pySparkManager import createSpark
from createTargetList import extractTarget
from createCorpus import createCorpusForUser
from generateResponse import generateTweet

# structure from tweet
dtypes = StructType([
    StructField("created_at", TimestampType(), True),
    StructField("tweet_id", StringType(), False),
    StructField("tweet", StringType(), False),
    StructField("likes", DecimalType(38, 0), False),
    StructField("retweet_count", DecimalType(38, 0), False),
    StructField("source", StringType(), True),
    StructField("user_id", DecimalType(38, 0), False),
    StructField("user_name", StringType(), True),
    StructField("user_screen_name", StringType(), False),
    StructField("user_description", StringType(), True),
    StructField("user_join_date", TimestampType(), True),
    StructField("user_followers_count", DecimalType(38, 0), False),
    StructField("user_location", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("long", DoubleType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("continent", StringType(), True),
    StructField("state", StringType(), True),
    StructField("state_code", StringType(), True),
    StructField("collected_at", TimestampType(), False)
])
def read_data(self):
    userSchema = StructType([
        StructField('medallion', StringType()),
        StructField('pickup_time', TimestampType()),
        StructField('total_amount', DoubleType()),
    ])

    self.fare = self.spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "nycfare1") \
        .option("startingOffsets", "earliest") \
        .option('failOnDataLoss', 'false') \
        .option("maxOffsetsPerTrigger", 1000) \
        .load()

    self.df_fare = self.fare.selectExpr("CAST(value as STRING) as json") \
        .select(from_json("json", userSchema).alias('data')) \
        .selectExpr(
            "data.medallion as medallion_fare",
            "cast (data.pickup_time as timestamp) as pickup_time_fare",
            "cast (data.total_amount as float)",
        )

    userSchema = StructType([
        StructField('medallion', StringType()),
        StructField('pickup_time', TimestampType()),
        StructField('dropoff_time', TimestampType()),
        StructField('passenger_count', IntegerType()),
        StructField('trip_time', IntegerType()),
        StructField('trip_distance', DoubleType()),
        StructField('pickup_loc', MapType(StringType(), DoubleType())),
        StructField('dropoff_loc', MapType(StringType(), DoubleType()))
    ])

    self.trip = self.spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "nycspeed9") \
        .option("startingOffsets", "earliest") \
        .option('failOnDataLoss', 'false') \
        .option("maxOffsetsPerTrigger", 1000) \
        .load()

    self.df_trip = self.trip.selectExpr("CAST(value as STRING) as json") \
        .select(from_json("json", userSchema).alias('data')) \
        .selectExpr(
            "data.medallion as medallion_trip",
            "cast (data.pickup_time as timestamp) as pickup_time_trip",
            "cast (data.dropoff_time as timestamp)",
            "cast (data.passenger_count as integer)",
            "cast (data.trip_time as integer)",
            "cast (data.trip_distance as float)",
            "cast (data.pickup_loc.lat as float) as pickup_loc_lat",    # "cast data.pickup_loc.lat as pickup_loc_lat"
            "cast (data.pickup_loc.lon as float) as pickup_loc_lon",    # "cast data.pickup_loc.lon as pickup_loc_lon"
            "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",  # "cast data.dropoff_loc.lat as dropoff_loc_lat"
            "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",  # "cast data.dropoff_loc.lon as dropoff_loc_lon"
        )
    print(self.df_trip.printSchema())

    self.df = self.df_trip.join(
        self.df_fare,
        expr("""
            medallion_trip = medallion_fare AND
            pickup_time_trip >= pickup_time_fare - interval 1 hour AND
            pickup_time_trip <= pickup_time_fare + interval 1 hour
        """)
    )

    print((self.df
           .writeStream
           .outputMode("append")
           .format("console")
           .option('truncate', 'false')
           .option('numRows', 20)
           .start()
           .awaitTermination()
           ))

    # Trailing line continuation removed: the option chain ends here, before start().
    query = self.windowedCounts.writeStream \
        .outputMode("append") \
        .queryName("writing_to_es") \
        .format("org.elasticsearch.spark.sql") \
        .option("checkpointLocation", "/tmp/1") \
        .option("es.nodes", "localhost") \
        .option("es.port", "9200") \
        .option("es.resource", "nycfare2/_doc")

    query.start().awaitTermination()
                                   str(row[5]),
                                   int(row[6]),
                                   int(row[7]),
                                   int(row[8]),
                                   int(row[9]),
                                   row[10]))

# Rewritten from Python 2 tuple-unpacking lambdas (a syntax error in Python 3):
# t[10] is the url field `l`; the other ten fields pass through unchanged.
resultMap_FilterUlr = resultMap \
    .map(lambda t: t[:10] + (regularExpression(t[10].split(",")),)) \
    .filter(lambda t: len(t[10]) > 1)

# put into JSON
fields = StructType(
    [StructField("GSN", StringType(), False),
     StructField("ChargingID", IntegerType(), False),
     StructField("RecordSequence", IntegerType(), False),
     StructField("RecordOpeningDate", TimestampType(), False),
     StructField("rATType", IntegerType(), False),
     StructField("UserLocation", StringType(), False),
     StructField("Accuracy", IntegerType(), False),
     StructField("BrowsingSession", IntegerType(), False),
     StructField("Uplink", IntegerType(), False),
     StructField("Downlink", IntegerType(), False),
     StructField("Urls", ArrayType(StringType(), False))])

# The new JSON format
newStructure = StructType(
    [StructField("GSN", StringType(), False),
     StructField("ChargingID", IntegerType(), False),
     StructField("RecordSequence", IntegerType(), False),
     StructField("RecordOpeningDate", TimestampType(), False),
     StructField("rATType", IntegerType(), False),
def test_as_spark_type_koalas_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(koalas_dtype(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        koalas_dtype(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        koalas_dtype(np.dtype("object"))
def process_log_data(spark, input_data, output_data):
    """
    Process the event log file and extract data for table time, users and songplays with Spark.
    --------
    Param:
        spark: A spark session instance.
        input_data: input data path.
        output_data: output data path.
    Return:
        None.
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name',
                                'lastName as last_name', 'gender', 'level').drop_duplicates()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType())
    df = df.withColumn('timestamp', get_timestamp('ts'))

    # create datetime column from original timestamp column
    # (applied to 'ts': datetime.fromtimestamp expects epoch seconds, not an already-converted timestamp column)
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    df = df.withColumn('start_time', get_datetime('ts')) \
        .withColumn('hour', hour('start_time')) \
        .withColumn('day', dayofmonth('start_time')) \
        .withColumn('week', weekofyear('start_time')) \
        .withColumn('month', month('start_time')) \
        .withColumn('year', year('start_time')) \
        .withColumn('weekday', dayofweek('start_time'))

    # extract columns to create time table
    time_table = df.select('start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday').drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs/*/*/*'))
    songs_logs_df = df.join(song_df, (df.song == song_df.title))

    artists_df = spark.read.parquet(os.path.join(output_data, 'artists'))
    songs_logs_artists_df = songs_logs_df.join(artists_df, (songs_logs_df.artist == artists_df.name))

    songplays_df = songs_logs_artists_df.join(
        time_table,
        (songs_logs_artists_df.start_time == time_table.start_time),
        'left').drop(songs_logs_artists_df.year)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = songplays_df.select(
        monotonically_increasing_id().alias('songplay_id'),
        col('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month')).drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'songplays/')
def test_historical_feature_retrieval_from_local_spark_session(
    spark,
    client,
    driver_entity,
    customer_entity,
    bookings_feature_table,
    transactions_feature_table,
):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1)),
        (2001, 8001, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=1)),
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (1001, 8001, datetime(year=2020, month=9, day=3)),
        (1001, 8001, datetime(year=2020, month=9, day=4)),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "customer_driver_pair",
                                                  schema, df_data)
    customer_driver_pairs_source = FileSource("event_timestamp", "created_timestamp",
                                              "parquet", file_uri)
    joined_df = client.get_historical_features_df(
        ["transactions:total_transactions", "bookings:total_completed_bookings"],
        customer_driver_pairs_source,
    )
    expected_joined_df_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__total_transactions", DoubleType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1), 100.0, 100),
        (2001, 8001, datetime(year=2020, month=9, day=2), 400.0, 150),
        (2001, 8002, datetime(year=2020, month=9, day=1), 400.0, None),
        (1001, 8001, datetime(year=2020, month=9, day=2), 200.0, 150),
        (1001, 8001, datetime(year=2020, month=9, day=3), 200.0, 150),
        (1001, 8001, datetime(year=2020, month=9, day=4), 300.0, None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
def process_log_data(spark, input_data, output_data):
    """
    Processing log data (users, time table, songplay) from the JSON given by S3;
    after data normalization and transformation these data are written as parquet files.
    """
    # Providing the JSON structure to Spark
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])

    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema=logdata_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender", "level")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # 'mm' (minutes), not 'MM' (months), in the time part of the pattern
    tsFormat = "yyyy-MM-dd HH:mm:ss z"

    # Converting ts to a timestamp format
    time_table = df.withColumn(
        'ts',
        to_timestamp(
            date_format((df.ts / 1000).cast(dataType=TimestampType()), tsFormat),
            tsFormat))

    # extract columns to create time table
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, song_df.artist_name == df.artist)\
        .withColumn("songplay_id", monotonically_increasing_id())\
        .withColumn('start_time',
                    to_timestamp(date_format((col("ts") / 1000).cast(dataType=TimestampType()), tsFormat), tsFormat))\
        .select("songplay_id",
                "start_time",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                col("artist_location").alias("location"),
                "userAgent",
                month(col("start_time")).alias("month"),
                year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data + "songplays")
def process_log_data(spark, input_data, output_data):
    """
    Description: This function fetches log_data from S3 into a staging dataframe,
    then extracts the time, users and songplays tables, and eventually exports
    data back to S3

    Parameters:
        spark       : object for Spark Session
        input_data  : location of log_data
        output_data : location of target S3 bucket
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # define schema
    logdata_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Lng()),
        Fld("lastName", Str()),
        Fld("length", Str()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Lng()),
        Fld("ts", Lng()),
        Fld("user_agent", Str()),
        Fld("userId", Str())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logdata_schema)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    selection = ['userId as user_id', 'firstName as first_name',
                 'lastName as last_name', 'gender as gender',
                 'level as level']
    users_table = df.selectExpr(selection).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp((x / 1000.0)), TimestampType())
    df = df.withColumn("start_time", get_datetime('ts'))

    # extract columns to create time table
    time_table = df.select('start_time').dropDuplicates()
    time_table = time_table.\
        withColumn("hour", hour(time_table.start_time)).\
        withColumn("day", dayofmonth(time_table.start_time)).\
        withColumn("week", weekofyear(time_table.start_time)).\
        withColumn("month", month(time_table.start_time)).\
        withColumn("year", year(time_table.start_time)).\
        withColumn("weekday", dayofweek(time_table.start_time))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs/*/*/*')

    # extract columns from joined song and log datasets to create songplays table
    selection = ['songplay_id', 'start_time',
                 'userId as user_id', 'level', 'song_id',
                 'artist_id', 'sessionId as session_id',
                 'location', 'user_agent',
                 'year', 'month']
    songplays_table = df.join(song_df, (df.song == song_df.title)).\
        withColumn('songplay_id', monotonically_increasing_id()).\
        withColumn("month", month('start_time')).\
        withColumn("year", year('start_time')).\
        selectExpr(selection)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays/')
def _create_from_pandas_with_arrow(self, pdf: "PandasDataFrameLike",
                                   schema: Union[StructType, List[str]],
                                   timezone: str) -> "DataFrame":
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions,
    converting to Arrow data, then sending to the JVM to parallelize. If a schema is
    passed in, the data types will be used to coerce the data in Pandas to Arrow
    conversion.
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, SparkSession)

    from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import TimestampType
    from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
    from pyspark.sql.pandas.utils import (
        require_minimum_pandas_version,
        require_minimum_pyarrow_version,
    )

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
        for name, field in zip(schema, arrow_schema):
            struct.add(name, from_arrow_type(field.type, prefer_timestamp_ntz),
                       nullable=field.nullable)
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [
            to_arrow_type(TimestampType())
            if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
            for t in pdf.dtypes
        ]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream
    arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                  for pdf_slice in pdf_slices]

    jsqlContext = self._wrapped._jsqlContext  # type: ignore[attr-defined]

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()  # type: ignore[attr-defined]
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    @no_type_check
    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    @no_type_check
    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(  # type: ignore[attr-defined]
        arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(  # type: ignore[attr-defined]
        jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
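# A minimal usage sketch (assuming Spark 3.x configuration names): with Arrow
# enabled, spark.createDataFrame(pdf) takes the fast path implemented above,
# and datetime64 pandas columns are coerced to TimestampType as in the
# arrow_types branch.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({
    "id": [1, 2, 3],
    "purchased_at": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
})
df = spark.createDataFrame(pdf)
df.printSchema()  # purchased_at: timestamp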
def process_log_data(spark, input_data, output_data):
    """
    Description: Load data from Log Dataset JSON files in S3, extract it into a
    DataFrame, then write DataFrame as parquet files back to S3
    """
    log_data = input_data + 'log_data/*.json'
    df = spark.read.json(log_data)
    df = df.filter(df.page == "NextSong")

    user_cols = ["userId as user_id", "firstName as first_name",
                 "lastName as last_name", "gender", "level"]
    users_table = df.selectExpr(user_cols).dropDuplicates()
    users_table.write.parquet(output_data + "users/", mode="overwrite")

    # fromtimestamp expects seconds, while ts is in milliseconds; hence the division by 1000
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).isoformat())
    df = df.withColumn("start_time", get_timestamp("ts").cast(TimestampType()))

    time_table = df.select("start_time") \
        .withColumn("hour", F.hour("start_time")) \
        .withColumn("day", F.dayofmonth("start_time")) \
        .withColumn("week", F.weekofyear("start_time")) \
        .withColumn("month", F.month("start_time")) \
        .withColumn("year", F.year("start_time")) \
        .withColumn("weekday", F.dayofweek("start_time"))
    time_table.write.partitionBy("year", "month").parquet(output_data + "time/", mode="overwrite")

    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')
    df = df.orderBy("ts")
    df = df.withColumn("songplay_id", F.monotonically_increasing_id())

    song_df.createOrReplaceTempView("staging_songs")
    df.createOrReplaceTempView("staging_events")

    songplays_table = spark.sql("""
        SELECT se.songplay_id,
               se.start_time,
               se.userId            as user_id,
               se.level,
               ss.song_id,
               se.sessionId         as session_id,
               ss.artist_id,
               se.location,
               se.userAgent         as user_agent,
               YEAR(se.start_time)  as year,
               MONTH(se.start_time) as month
        FROM staging_events se
        LEFT JOIN staging_songs ss
          ON (se.song = ss.title AND se.artist = ss.artist_name)
    """)

    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "songplays/", mode="overwrite")
def create_test_scalar_dataset(tmp_url, num_rows, num_files=4, spark=None):
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')
        spark = spark_session.getOrCreate()
        shutdown = True

    expected_data = [{
        'id': np.int32(i),
        'int_fixed_size_list': np.arange(1 + i, 10 + i).astype(np.int32),
        'datetime': np.datetime64('2019-01-02'),
        'timestamp': np.datetime64('2005-02-25T03:30'),
        'string': np.unicode_('hello_{}'.format(i)),
        'string2': np.unicode_('world_{}'.format(i)),
        'float64': np.float64(i) * .66
    } for i in range(num_rows)]

    expected_data_as_scalars = [{
        k: np.asscalar(v) if isinstance(v, np.generic) else v
        for k, v in row.items()
    } for row in expected_data]

    # np.datetime64 is converted to timezone-unaware datetime instances. Working explicitly
    # in UTC so we don't need to think about local timezone in the tests
    for row in expected_data_as_scalars:
        row['timestamp'] = row['timestamp'].replace(tzinfo=pytz.UTC)
        row['int_fixed_size_list'] = row['int_fixed_size_list'].tolist()

    rows = [Row(**row) for row in expected_data_as_scalars]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    # We must maintain alphabetical order of the struct fields for the code to work!!!
    schema = StructType([
        StructField('datetime', DateType(), False),
        StructField('float64', DoubleType(), False),
        StructField('id', IntegerType(), False),
        StructField('int_fixed_size_list', ArrayType(IntegerType(), False), False),
        StructField('string', StringType(), False),
        StructField('string2', StringType(), False),
        StructField('timestamp', TimestampType(), False),
    ])
    dataframe = spark.createDataFrame(rows, schema)

    dataframe. \
        coalesce(num_files). \
        write.option('compression', 'none'). \
        mode('overwrite'). \
        parquet(tmp_url)

    if shutdown:
        spark.stop()

    return expected_data
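# A minimal sketch of the pitfall flagged in the WARNING above: with an
# explicit schema, createDataFrame matches values to fields by position, not
# by name. On Spark 2.x, Row(**kwargs) additionally sorts its fields
# alphabetically, which is why the StructType above must list its fields in
# alphabetical order.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()
demo_schema = StructType([
    StructField('id', IntegerType(), False),
    StructField('string', StringType(), False),
])
# The first tuple element always lands in the first schema field,
# whatever either of them is called.
demo_df = spark.createDataFrame([(1, 'a'), (2, 'b')], demo_schema)
demo_df.show()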
def process_log_data(spark, input_data, output_data):
    """This function loads log_data from S3 and processes it by extracting the
    users and time dimension tables and the songplays fact table, then loads
    them back to S3

    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process log data *****")

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    try:
        df = spark.read.json(log_data)
    except Exception as e:
        print(e)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table ("user_id" after the selectExpr renames)
    users_fields = ["userId as user_id", "firstName as first_name",
                    "lastName as last_name", "gender", "level", "ts"]
    users_table = df.selectExpr(users_fields).orderBy("ts", ascending=False) \
        .dropDuplicates(["user_id"]).drop("ts")

    # write users table to parquet files
    try:
        users_table.write.parquet(output_data + "users.parquet", mode="overwrite")
    except Exception as e:
        print(e)
    print("**** users table data load is complete *****")

    # create timestamp column from original timestamp column
    # get_timestamp = udf(date_convert, TimestampType())
    # df = df.withColumn("datetime", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ms: datetime.fromtimestamp(ms // 1000), TimestampType())
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table (selectExpr, since these are SQL expressions)
    time_fields = ["datetime as start_time", "hour(datetime) as hour",
                   "dayofmonth(datetime) as day", "weekofyear(datetime) as week",
                   "month(datetime) as month", "year(datetime) as year",
                   "dayofweek(datetime) as weekday"]
    time_table = df.selectExpr(time_fields).dropDuplicates(["start_time"])

    # write time table to parquet files partitioned by year and month
    try:
        time_table.write.parquet(output_data + "time.parquet",
                                 partitionBy=("year", "month"), mode="overwrite")
    except Exception as e:
        print(e)
    print("**** time table data load is complete *****")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(output_data + "songs.parquet")
    artists_df = spark.read.parquet(output_data + "artists.parquet")
    song_df = songs_df.join(artists_df.alias("artists"),
                            songs_df.artist_id == artists_df.artist_id,
                            "inner").select("title", "name", "duration", "song_id", "artists.artist_id")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df,
                              (df.song == song_df.title) &
                              (df.artist == song_df.name) &
                              (df.length == song_df.duration),
                              "inner")
    songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id())
    songplays_table = songplays_table.selectExpr("songplay_id",
                                                 "datetime as start_time",
                                                 "userId as user_id",
                                                 "month(datetime) as month",
                                                 "year(datetime) as year",
                                                 "level", "song_id", "artist_id",
                                                 "sessionId as session_id",
                                                 "location",
                                                 "userAgent as user_agent").dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    try:
        songplays_table.write.parquet(output_data + "songplays.parquet",
                                      partitionBy=("year", "month"), mode="overwrite")
    except Exception as e:
        print(e)
    print("**** songplays table data load is complete *****")
    print("**** log data processing is finished *****")
from listenbrainz_spark.constants import LAST_FM_FOUNDING_YEAR
from listenbrainz_spark.exceptions import HDFSException
from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY
from listenbrainz_spark.stats import (offset_days, offset_months, get_day_end,
                                      get_month_end, get_year_end, replace_days,
                                      replace_months, run_query)
from listenbrainz_spark.stats.utils import (filter_listens, get_last_monday,
                                            get_latest_listen_ts)
from listenbrainz_spark.utils import get_listens

from pyspark.sql.functions import collect_list, sort_array, struct, lit
from pyspark.sql.types import (StringType, StructField, StructType, TimestampType)

time_range_schema = StructType((StructField('time_range', StringType()),
                                StructField('start', TimestampType()),
                                StructField('end', TimestampType())))


def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges
    # which have zero listens.
    result_without_zero_days = run_query("""
        SELECT listens.user_name
             , time_range.time_range
             , count(listens.user_name) as listen_count
          FROM listens
          JOIN time_range
            ON listens.listened_at >= time_range.start
           AND listens.listened_at <= time_range.end
      GROUP BY listens.user_name
             , time_range.time_range
def get_hrv_features(rr_data, acceptable_percentage=50, window_length=60):
    """
    Args:
        rr_data (DataStream):
        acceptable_percentage (int):
        window_length (int):

    Returns:
    """
    stream_name = 'org.md2k.autosense.ecg.features'

    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \
            .add_input_stream(rr_data.metadata.get_name()) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("var")
                .set_type("double")
                .set_attribute("description", "variance")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("iqr")
                .set_type("double")
                .set_attribute("description", "Inter Quartile Range")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("mean")
                .set_type("double")
                .set_attribute("description", "Mean RR Interval")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("median")
                .set_type("double")
                .set_attribute("description", "Median RR Interval")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("80th")
                .set_type("double")
                .set_attribute("description", "80th percentile RR Interval")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("20th")
                .set_type("double")
                .set_attribute("description", "20th percentile RR Interval")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("heartrate")
                .set_type("double")
                .set_attribute("description", "Heart Rate in BPM")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("vlf")
                .set_type("double")
                .set_attribute("description", "Very Low Frequency Energy")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("lf")
                .set_type("double")
                .set_attribute("description", "Low Frequency Energy")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("hf")
                .set_type("double")
                .set_attribute("description", "High Frequency Energy")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("lfhf")
                .set_type("double")
                .set_attribute("description", "Low frequency to High Frequency energy ratio")) \
            .add_dataDescriptor(
                DataDescriptor()
                .set_name("window")
                .set_type("struct")
                .set_attribute("description", "window start and end time in UTC")
                .set_attribute('start', 'start of window')
                .set_attribute('end', 'end of window')) \
            .add_module(
                ModuleMetadata().set_name("HRV Features from ECG RR Interval")
                .set_attribute("url", "http://md2k.org/")
                .set_attribute('algorithm', 'ecg feature computation')
                .set_attribute('unit', 'ms')
                .set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata

    def get_rr_features(a):
        return np.array([
            np.var(a),
            iqr(a),
            np.mean(a),
            np.median(a),
            np.percentile(a, 80),
            np.percentile(a, 20),
            60000 / np.median(a)
        ])

    def frequencyDomain(RRints, tmStamps, band_type=None, lf_bw=0.11, hf_bw=0.1,
                        vlf=(0.003, 0.04), lf=(0.04, 0.15), hf=(0.15, 0.4)):
        """
        Args:
            RRints:
            tmStamps:
            band_type:
            lf_bw:
            hf_bw:
            vlf:
            lf:
            hf:

        Returns:
        """
        NNs = RRints
        tss = tmStamps
        frequency_range = np.linspace(0.001, 1, 10000)
        NNs = np.array(NNs)
        NNs = NNs - np.mean(NNs)
        result = signal.lombscargle(tss, NNs, frequency_range)

        # Pwelch w/ zero pad
        fxx = frequency_range
        pxx = result

        if band_type == 'adapted':
            vlf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])]))[0][0]]
            lf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])]))[0][0]]
            hf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])]))[0][0]]
            peak_freqs = (vlf_peak, lf_peak, hf_peak)

            hf = (peak_freqs[2] - hf_bw / 2, peak_freqs[2] + hf_bw / 2)
            lf = (peak_freqs[1] - lf_bw / 2, peak_freqs[1] + lf_bw / 2)
            vlf = (0.003, lf[0])

            if lf[0] < 0:
                print('***Warning***: Adapted LF band lower bound spills into '
                      'negative frequency range')
                print('Lower threshold of LF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                lf = (0, lf[1])
                vlf = (0, 0)
            elif hf[0] < 0:
                print('***Warning***: Adapted HF band lower bound spills into '
                      'negative frequency range')
                print('Lower threshold of HF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                hf = (0, hf[1])
                lf = (0, 0)
                vlf = (0, 0)

        df = fxx[1] - fxx[0]
        vlf_power = np.trapz(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])], dx=df)
        lf_power = np.trapz(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])], dx=df)
        hf_power = np.trapz(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])], dx=df)
        totalPower = vlf_power + lf_power + hf_power

        # Normalize and take log
        vlf_NU_log = np.log((vlf_power / (totalPower - vlf_power)) + 1)
        lf_NU_log = np.log((lf_power / (totalPower - vlf_power)) + 1)
        hf_NU_log = np.log((hf_power / (totalPower - vlf_power)) + 1)
        lfhfRation_log = np.log((lf_power / hf_power) + 1)

        freqDomainFeats = {
            'VLF_Power': vlf_NU_log,
            'LF_Power': lf_NU_log,
            'HF_Power': hf_NU_log,
            'LF/HF': lfhfRation_log
        }
        return freqDomainFeats

    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("features", ArrayType(DoubleType()))
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('org.md2k.autosense.ecg.rr', 'get_hrv_features', stream_name,
                 ['user', 'timestamp'], ['user', 'timestamp'])
    def ecg_r_peak(key, data):
        """
        Args:
            key:
            data:

        Returns:
        """
        if data.shape[0] >= acceptable_percentage * window_length / 100:
            data = data.sort_values('time')
            data['time'] = 1000 * data['time']
            a = data['rr'].values
            features = [
                np.double(
                    np.array(
                        list(get_rr_features(a)) +
                        list(frequencyDomain(np.array(a) / 1000, np.cumsum(a) / 1000).values())))
            ]
            data = data[:1]
            data['features'] = features
            data['start'] = [key[2]['start']]
            data['end'] = [key[2]['end']]
            data = data[['timestamp', 'localtime', 'version', 'user', 'start', 'end', 'features']]
            return data
        else:
            return pd.DataFrame([], columns=['timestamp', 'localtime', 'version',
                                             'user', 'features', 'start', 'end'])

    rr_data = rr_data.withColumn('time', F.col('timestamp').cast('double'))
    ecg_features = rr_data.compute(ecg_r_peak, windowDuration=window_length, startTime='0 seconds')
    df = ecg_features.select('timestamp', F.struct('start', 'end').alias('window'),
                             'localtime', 'features', 'user', 'version')
    df = df.withColumn('var', F.col('features').getItem(0))
    df = df.withColumn('iqr', F.col('features').getItem(1))
    df = df.withColumn('vlf', F.col('features').getItem(7))
    df = df.withColumn('lf', F.col('features').getItem(8))
    df = df.withColumn('hf', F.col('features').getItem(9))
    df = df.withColumn('lfhf', F.col('features').getItem(10))
    df = df.withColumn('mean', F.col('features').getItem(2))
    df = df.withColumn('median', F.col('features').getItem(3))
    df = df.withColumn('80th', F.col('features').getItem(4))
    df = df.withColumn('20th', F.col('features').getItem(5))
    ecg_features_final = df.withColumn('heartrate', F.col('features').getItem(6))
    ecg_features_final = ecg_features_final.drop('features')

    feature_names = ['var', 'iqr', 'mean', 'median', '80th', '20th',
                     'heartrate', 'vlf', 'lf', 'hf', 'lfhf']
    stress_features = ecg_features_final.withColumn(
        'features', F.array([F.col(i) for i in feature_names]))
    stress_features.metadata = get_metadata()
    return stress_features
from pyspark.sql.types import (
    DateType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

raw = StructType([StructField("value", StringType(), False)])

bronze = StructType([
    StructField("datasource", StringType(), False),
    StructField("ingesttime", TimestampType(), False),
    StructField("value", StringType(), True),
    StructField("p_ingestdate", DateType(), False),
])

silver = StructType([
    StructField("device_id", IntegerType(), True),
    StructField("device_type", StringType(), True),
    StructField("heartrate", DoubleType(), True),
    StructField("eventtime", TimestampType(), True),
    StructField("name", StringType(), True),
    StructField("p_eventdate", DateType(), True),
])
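# A minimal sketch (the path and datasource name are hypothetical) of how the
# raw and bronze schemas above fit together: read the plain-text payload with
# the one-column `raw` schema, then stamp the ingestion metadata that `bronze`
# expects.
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, current_timestamp, lit

spark = SparkSession.builder.getOrCreate()

raw_df = spark.read.schema(raw).text("/tmp/health_tracker/raw/")
bronze_df = raw_df.select(
    lit("health_tracker_stream").alias("datasource"),
    current_timestamp().alias("ingesttime"),
    "value",
    current_date().alias("p_ingestdate"),
)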
def test_verify_type_not_nullable(self):
    import array
    import datetime
    import decimal

    schema = StructType([
        StructField('s', StringType(), nullable=False),
        StructField('i', IntegerType(), nullable=True)
    ])

    class MyObj:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # obj, data_type
    success_spec = [
        # String
        ("", StringType()),
        (u"", StringType()),
        (1, StringType()),
        (1.0, StringType()),
        ([], StringType()),
        ({}, StringType()),
        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),
        # Boolean
        (True, BooleanType()),
        # Byte
        (-(2**7), ByteType()),
        (2**7 - 1, ByteType()),
        # Short
        (-(2**15), ShortType()),
        (2**15 - 1, ShortType()),
        # Integer
        (-(2**31), IntegerType()),
        (2**31 - 1, IntegerType()),
        # Long
        (-(2**63), LongType()),
        (2**63 - 1, LongType()),
        # Float & Double
        (1.0, FloatType()),
        (1.0, DoubleType()),
        # Decimal
        (decimal.Decimal("1.0"), DecimalType()),
        # Binary
        (bytearray([1, 2]), BinaryType()),
        # Date/Timestamp
        (datetime.date(2000, 1, 2), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),
        # Array
        ([], ArrayType(IntegerType())),
        (["1", None], ArrayType(StringType(), containsNull=True)),
        ([1, 2], ArrayType(IntegerType())),
        ((1, 2), ArrayType(IntegerType())),
        (array.array('h', [1, 2]), ArrayType(IntegerType())),
        # Map
        ({}, MapType(StringType(), IntegerType())),
        ({"a": 1}, MapType(StringType(), IntegerType())),
        ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=True)),
        # Struct
        ({"s": "a", "i": 1}, schema),
        ({"s": "a", "i": None}, schema),
        ({"s": "a"}, schema),
        ({"s": "a", "f": 1.0}, schema),
        (Row(s="a", i=1), schema),
        (Row(s="a", i=None), schema),
        (["a", 1], schema),
        (["a", None], schema),
        (("a", 1), schema),
        (MyObj(s="a", i=1), schema),
        (MyObj(s="a", i=None), schema),
        (MyObj(s="a"), schema),
    ]

    # obj, data_type, exception class
    failure_spec = [
        # String (match anything but None)
        (None, StringType(), ValueError),
        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),
        # Boolean
        (1, BooleanType(), TypeError),
        ("True", BooleanType(), TypeError),
        ([1], BooleanType(), TypeError),
        # Byte
        (-(2**7) - 1, ByteType(), ValueError),
        (2**7, ByteType(), ValueError),
        ("1", ByteType(), TypeError),
        (1.0, ByteType(), TypeError),
        # Short
        (-(2**15) - 1, ShortType(), ValueError),
        (2**15, ShortType(), ValueError),
        # Integer
        (-(2**31) - 1, IntegerType(), ValueError),
        (2**31, IntegerType(), ValueError),
        # Float & Double
        (1, FloatType(), TypeError),
        (1, DoubleType(), TypeError),
        # Decimal
        (1.0, DecimalType(), TypeError),
        (1, DecimalType(), TypeError),
        ("1.0", DecimalType(), TypeError),
        # Binary
        (1, BinaryType(), TypeError),
        # Date/Timestamp
        ("2000-01-02", DateType(), TypeError),
        (946811040, TimestampType(), TypeError),
        # Array
        (["1", None], ArrayType(StringType(), containsNull=False), ValueError),
        ([1, "2"], ArrayType(IntegerType()), TypeError),
        # Map
        ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError),
        ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError),
        ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=False), ValueError),
        # Struct
        ({"s": "a", "i": "1"}, schema, TypeError),
        (Row(s="a"), schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), schema, TypeError),
        (["a"], schema, ValueError),
        (["a", "1"], schema, TypeError),
        (MyObj(s="a", i="1"), schema, TypeError),
        (MyObj(s=None, i="1"), schema, ValueError),
    ]

    # Check success cases
    for obj, data_type in success_spec:
        try:
            _make_type_verifier(data_type, nullable=False)(obj)
        except Exception:
            self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type))

    # Check failure cases
    for obj, data_type, exp in failure_spec:
        msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp)
        with self.assertRaises(exp, msg=msg):
            _make_type_verifier(data_type, nullable=False)(obj)
def process_log_data(spark, input_data, output_data):
    """
    Processes all log data JSON files in the given input folder and stores
    them in parquet format in the output folder.

    :param spark: spark session
    :param input_data: input data path
    :param output_data: output data path
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    log_df = spark.read.json(log_data)

    # filter by actions for song plays (the page value is case-sensitive)
    log_df = log_df.filter(log_df.page == 'NextSong')

    # extract columns for users table
    users_fields = [
        "userId as user_id",
        "firstName as first_name",
        "lastName as last_name",
        "gender",
        "level"
    ]
    users_table = log_df.selectExpr(users_fields).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + 'users')

    # convert the epoch-millisecond ts column to epoch seconds; the value is
    # still numeric here, so the UDF must be declared DoubleType (imported
    # from pyspark.sql.types), not TimestampType
    get_timestamp = udf(lambda x: x / 1000, DoubleType())
    log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts))

    # create datetime column from the epoch-second column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    log_df = log_df.withColumn("start_time", get_datetime(log_df.timestamp))

    # extract columns to create time table
    log_df = log_df.withColumn("hour", hour("start_time")) \
        .withColumn("day", dayofmonth("start_time")) \
        .withColumn("week", weekofyear("start_time")) \
        .withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time")) \
        .withColumn("weekday", dayofweek("start_time"))

    time_table = log_df.select("start_time", "hour", "day", "week",
                               "month", "year", "weekday")

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table; drop the song's release
    # year so it cannot clash with the play-time year used for partitioning
    songs_df = spark.read.parquet(os.path.join(output_data, "songs")).drop("year")
    songs_logs = log_df.join(songs_df, (log_df.song == songs_df.title))

    # extract columns from joined song and log datasets to create songplays
    # table; only the artist name is needed from the artists table, and
    # selecting it alone avoids duplicate column names downstream
    artists_df = spark.read.parquet(os.path.join(output_data, "artists")) \
        .select("name")
    artists_songs_logs = songs_logs.join(
        artists_df, (songs_logs.artist == artists_df.name))

    songplays = artists_songs_logs.join(
        time_table,
        artists_songs_logs.start_time == time_table.start_time,
        'left') \
        .drop(time_table.start_time) \
        .drop(artists_songs_logs.year) \
        .drop(artists_songs_logs.month)

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays.select(
        col('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month'),
    ).repartition("year", "month")

    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + 'songplays')
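A note on the timestamp conversion above: a UDF declared TimestampType whose lambda returns a plain number produces nulls, which is why the first UDF is typed DoubleType. The same conversion also works without any Python UDF; a minimal sketch, assuming log_df and its ts column match the log schema used above:

from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType

# Dividing epoch milliseconds by 1000 gives epoch seconds as a double;
# casting a numeric column to TimestampType interprets it as epoch seconds.
log_df = log_df.withColumn(
    "start_time", (col("ts") / 1000).cast(TimestampType()))

Avoiding the UDF also keeps the conversion inside Spark's optimizer instead of round-tripping every row through Python.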
def process_log_data(spark, input_data, output_data):
    """Gets the data from the Udacity S3 bucket available for this project,
    extracts the columns from the log_data path that the project requires,
    and creates the output tables in parquet files for the users, time, and
    songplays tables."""
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong').filter(df.userId.isNotNull())

    # extract columns for users table
    users_table = df.select(col("userId").alias("user_id"),
                            col("firstName").alias("first_name"),
                            col("lastName").alias("last_name"),
                            "gender",
                            "level").dropDuplicates()

    # write users table to parquet files
    print("##### [STARTING] Writing table to the parquet files: USERS #####")
    users_table.write.mode("overwrite").parquet(output_data + "users")
    print("##### [FINISHED] Table USERS loaded #####")

    # create timestamp column (epoch seconds, as a string) from the original
    # epoch-millisecond column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(col("ts")))

    # extract columns to create time table; 'E' formats the day of week
    # ('F' would be day-of-week-in-month, which is not a weekday)
    time_table = df.select(
        'timestamp',
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'),
        date_format('datetime', 'E').alias('weekday')
    )

    # write time table to parquet files partitioned by year and month
    print("##### [STARTING] Writing table to the parquet files: TIME #####")
    time_table.write.mode("overwrite").partitionBy("year", "month") \
        .parquet(output_data + "time")
    print("##### [FINISHED] Table TIME loaded #####")

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays
    # table; in the timestamp format string, minutes must be 'mm'
    # ('MM' would repeat the month)
    tsFormatVar = "yyyy/MM/dd HH:mm:ss z"

    # The songplays table is made with a join between the two dataframes;
    # the columns are then selected, applying a transformation where one
    # is needed.
    songplays_table = song_df.join(
        df,
        (song_df.artist_name == df.artist) & (song_df.title == df.song)) \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn(
            'start_time',
            to_timestamp(
                date_format((col("ts") / 1000).cast(dataType=TimestampType()),
                            tsFormatVar),
                tsFormatVar)) \
        .select("songplay_id",
                "start_time",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                col("artist_location").alias("location"),
                col("userAgent").alias("user_agent"),
                month(col("start_time")).alias("month"),
                year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    print("##### [STARTING] Writing table to the parquet files: SONGPLAYS #####")
    songplays_table.write.mode("overwrite").partitionBy("year", "month") \
        .parquet(output_data + "songplays")
    print("##### [FINISHED] Table SONGPLAYS loaded #####")
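The songplay_id above comes from monotonically_increasing_id, whose behavior is worth a small illustration: the generated IDs are unique and increasing within the dataframe but not consecutive, because the partition ID is encoded in the upper bits. A self-contained sketch (the app name and row count are arbitrary):

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.appName("id-demo").getOrCreate()

# Two partitions, three rows each: the IDs restart at a large offset on
# the second partition, so they are unique but have gaps.
demo = spark.range(6).repartition(2).withColumn(
    "row_id", monotonically_increasing_id())
demo.show()
# row_id values such as 0, 1, 2 and 8589934592, 8589934593, ... are typical

This is why the column works as a surrogate key but should not be treated as a dense row number.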
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    print("Reading log data files")
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file; note that ts holds epoch milliseconds, which
    # overflow a 32-bit integer, so it must be LongType
    schema = StructType([
        StructField('artist', StringType()),
        StructField('auth', StringType()),
        StructField('firstName', StringType()),
        StructField('gender', StringType()),
        StructField('itemInSession', IntegerType()),
        StructField('lastName', StringType()),
        StructField('length', DoubleType()),
        StructField('level', StringType()),
        StructField('location', StringType()),
        StructField('method', StringType()),
        StructField('page', StringType()),
        StructField('registration', StringType()),
        StructField('sessionId', IntegerType()),
        StructField('song', StringType()),
        StructField('status', IntegerType()),
        StructField('ts', LongType()),
        StructField('userAgent', StringType()),
        StructField('userId', IntegerType())
    ])
    # applying the schema to the log_data files was not working (ts had been
    # declared IntegerType), which is why the read was kept schemaless
    # df = spark.read.schema(schema).json(log_data)
    df = spark.read.json(log_data)

    # extract columns for users table; dropDuplicates and filter return new
    # dataframes, so their results must be reassigned to take effect
    users_table = df.select('userId', 'firstName', 'lastName',
                            'gender', 'level').dropDuplicates()
    users_table = users_table.filter(users_table.userId != " ")
    users_table = users_table.withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('firstName', 'first_name') \
        .withColumnRenamed('lastName', 'last_name')

    # write users table to parquet files
    users_table.write.parquet(output_data + "/users.parquet",
                              mode='overwrite', compression='snappy')

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                       TimestampType())
    df = df.withColumn("datetime", get_datetime(col("ts")))

    # the UDFs below return Python ints, so each is declared IntegerType
    # (a UDF's default return type is StringType)
    get_hour = udf(lambda x: datetime.fromtimestamp(x / 1000.0).hour,
                   IntegerType())
    df = df.withColumn("hour", get_hour(df.ts))

    # create day column from datetime
    get_day = udf(lambda x: datetime.fromtimestamp(x / 1000.0).day,
                  IntegerType())
    df = df.withColumn("day", get_day(df.ts))

    # create week column from datetime
    get_week = udf(
        lambda x: datetime.fromtimestamp(x / 1000.0).isocalendar()[1],
        IntegerType())
    df = df.withColumn("week", get_week(df.ts))

    # create month column from datetime
    get_month = udf(lambda x: datetime.fromtimestamp(x / 1000.0).month,
                    IntegerType())
    df = df.withColumn("month", get_month(df.ts))

    # create year column from datetime
    get_year = udf(lambda x: datetime.fromtimestamp(x / 1000.0).year,
                   IntegerType())
    df = df.withColumn("year", get_year(df.ts))

    # create weekday column from datetime
    get_weekday = udf(lambda x: datetime.fromtimestamp(x / 1000.0).weekday(),
                      IntegerType())
    df = df.withColumn("weekday", get_weekday(df.ts))

    # extract columns to create time table
    time_table = df.select(
        ["ts", "hour", "day", "week", "month", "year", "weekday"])

    print('--- Saving time_table')
    # write time table to parquet files partitioned by year and month
    time_table.write.mode('append').partitionBy(
        'year', 'month').parquet(output_data + "time_data")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, 'song_data/A/A/*/*.json')
    song_df = spark.read.json(song_data)

    print('--- Preparing songplays table ---')
    # this UDF is used to convert ts into a timestamp field
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    songplays_joined = df.join(
        song_df,
        (df.artist == song_df.artist_name) & (df.song == song_df.title),
        'inner') \
        .withColumn('start_time', get_timestamp(df.ts)) \
        .withColumn("songplay_id", monotonically_increasing_id())

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = songplays_joined.selectExpr(
        ['songplay_id', 'start_time', 'userId as user_id', 'level',
         'song_id', 'artist_id', 'sessionId as session_id', 'location',
         'userAgent as user_agent']) \
        .withColumn('year', year('start_time')) \
        .withColumn('month', month('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "/songplays.parquet", mode='overwrite',
        compression='snappy')
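Every variant above partitions songplays by year and month, so a short sketch of why that layout pays off on the read side may help; the bucket path and filter values here are illustrative, not taken from the project.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pruning-demo").getOrCreate()

# Reading back a year/month-partitioned table: filtering on the partition
# columns lets Spark skip every directory except year=2018/month=11.
songplays = spark.read.parquet("s3a://example-bucket/songplays.parquet")
november = songplays.where("year = 2018 AND month = 11")
november.explain()  # the plan's PartitionFilters should list year and month

Partition pruning happens at planning time, so only the matching directories are ever listed or scanned.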
"body_ic", "body_iv", "body_cu", ] # End of column definition # Start of schema definition session_schema = StructType([ StructField("fullVisitorId", StringType(), True), StructField("visitId", StringType(), True), StructField("userId", StringType(), True), StructField("visitNumber", IntegerType(), True), StructField("visitStartTime", LongType(), True), StructField("date", IntegerType(), True), StructField("timestamp", TimestampType(), True), StructField("trafficSource_campaign", StringType(), True), StructField("trafficSource_source", StringType(), True), StructField("trafficSource_medium", StringType(), True), StructField("trafficSource_keyword", StringType(), True), StructField("trafficSource_ad_content", StringType(), True), StructField("totals_transactionRevenue", StringType(), True), StructField("landingPage", StringType(), True), StructField("hits_type", StringType(), True), StructField("touchpoints", ArrayType(StringType()), True), StructField("touchpoints_wo_direct", ArrayType(StringType()), True), StructField("first_touchpoint", StringType(), True), StructField("last_touchpoint", StringType(), True) ]) ga_fields = {
# -*- coding: utf-8 -*-
from pyspark.sql.types import IntegerType, TimestampType
from pyspark.sql.functions import *

from base import spark
from utils import uuidsha

# cast before alias so the alias is the final column name
columns = [
    col('docu_dk').alias('alrt_docu_dk'),
    col('docu_nr_mp').alias('alrt_docu_nr_mp'),
    col('dt_fim_prazo').cast(TimestampType()).alias('alrt_date_referencia'),
    col('docu_orgi_orga_dk_responsavel').alias('alrt_orgi_orga_dk'),
    col('elapsed').alias('alrt_dias_referencia'),
    col('nm_delegacia').alias('alrt_info_adicional'),
    col('alrt_key')
]

key_columns = [
    col('docu_dk'),
    col('dt_fim_prazo')
]


def alerta_bdpa(options):
    documento = spark.table('documento') \
        .filter('DOCU_TPST_DK = 3').filter('DOCU_FSDC_DK = 1')
    orga_externo = spark.table('%s.mprj_orgao_ext' % options['schema_exadata']) \
        .withColumnRenamed('ORGE_NM_ORGAO', 'nm_delegacia')
    doc_origem = documento.join(
        orga_externo,
        documento.DOCU_ORGE_ORGA_DK_DELEG_ORIGEM == orga_externo.ORGE_ORGA_DK,
        'left'
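The module is cut off mid-join above, so here is a standalone hedged sketch of the pattern its column lists imply: an alias-heavy select list defined once at module level and applied with select. Every name and value below is invented for the demo and is not the module's actual data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType

spark = SparkSession.builder.appName("alias-list-demo").getOrCreate()

# Toy stand-in for the joined alert dataframe; the column names mirror the
# source columns referenced above, but the row is fabricated.
doc = spark.createDataFrame(
    [(1, "2019/0001", "2019-01-31 00:00:00", 10, 42, "DP Centro", "k1")],
    ["docu_dk", "docu_nr_mp", "dt_fim_prazo",
     "docu_orgi_orga_dk_responsavel", "elapsed", "nm_delegacia", "alrt_key"])

# Reusing one module-level alias list keeps every alert script's output
# columns consistent without repeating the renames at each call site.
alert_columns = [
    col("docu_dk").alias("alrt_docu_dk"),
    col("dt_fim_prazo").cast(TimestampType()).alias("alrt_date_referencia"),
    col("elapsed").alias("alrt_dias_referencia"),
]
doc.select(alert_columns).show()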