def test_datatype(self):
    first = T.StructType([
        T.StructField('f1', T.BooleanType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f3', T.IntegerType()),
        T.StructField('f4', T.LongType()),
    ])
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.LongType()),
        T.StructField('f1', T.BooleanType()),
    ])
    SparklyTest().assertRowsEqual(first, second, ignore_order=True)
    with self.assertRaises(AssertionError):
        self.assertEqual(first, second)

    # change entry (f4, T.LongType)
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.StringType()),
        T.StructField('f1', T.BooleanType()),
    ])
    with self.assertRaises(AssertionError):
        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type.

    Caches the mapping dictionary in order to avoid instantiating multiple
    objects on each call.
    """
    # Refer to the attribute of the function we use to cache the map by a name held in a
    # variable instead of 'dot' notation, to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name, {
            np.int8: T.ByteType(),
            np.uint8: T.ShortType(),
            np.int16: T.ShortType(),
            np.uint16: T.IntegerType(),
            np.int32: T.IntegerType(),
            np.int64: T.LongType(),
            np.float32: T.FloatType(),
            np.float64: T.DoubleType(),
            np.string_: T.StringType(),
            np.str_: T.StringType(),
            np.unicode_: T.StringType(),
            np.bool_: T.BooleanType(),
        })
    return getattr(_numpy_to_spark_mapping, cache_attr_name)
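# A minimal usage sketch (not part of the snippet above, assumes _numpy_to_spark_mapping
# is in scope): look up the Spark type that corresponds to a numpy dtype's scalar type.
import numpy as np

numpy_to_spark = _numpy_to_spark_mapping()
spark_type = numpy_to_spark[np.dtype('int32').type]  # IntegerType()
print(spark_type)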
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal,):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
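# A minimal usage sketch, assuming the imports the snippet above relies on
# (decimal, numpy as np, pyspark.sql.types as types) are in scope; the expected
# values follow directly from the branches of as_spark_type.
assert as_spark_type(int) == types.IntegerType()
assert as_spark_type("bigint") == types.LongType()
assert as_spark_type(decimal.Decimal) == types.DecimalType(38, 18)
assert as_spark_type(np.float64) == types.DoubleType()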
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .config('spark.default.parallelism', 4)
        .config('spark.driver.bindAddress', '127.0.0.1')
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        )
        .repartition(num_partitions)
        .sort('index')
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (
        s.read.csv(
            path=str(data_directory / 'batting.csv'),
            schema=pt.StructType([
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('stint', pt.IntegerType(), True),
                pt.StructField('teamID', pt.StringType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('G', pt.IntegerType(), True),
                pt.StructField('AB', pt.DoubleType(), True),
                pt.StructField('R', pt.DoubleType(), True),
                pt.StructField('H', pt.DoubleType(), True),
                pt.StructField('X2B', pt.DoubleType(), True),
                pt.StructField('X3B', pt.DoubleType(), True),
                pt.StructField('HR', pt.DoubleType(), True),
                pt.StructField('RBI', pt.DoubleType(), True),
                pt.StructField('SB', pt.DoubleType(), True),
                pt.StructField('CS', pt.DoubleType(), True),
                pt.StructField('BB', pt.DoubleType(), True),
                pt.StructField('SO', pt.DoubleType(), True),
                pt.StructField('IBB', pt.DoubleType(), True),
                pt.StructField('HBP', pt.DoubleType(), True),
                pt.StructField('SH', pt.DoubleType(), True),
                pt.StructField('SF', pt.DoubleType(), True),
                pt.StructField('GIDP', pt.DoubleType(), True),
            ]),
            header=True,
        )
        .repartition(num_partitions)
        .sort('playerID')
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (
        s.read.csv(
            path=str(data_directory / 'awards_players.csv'),
            schema=pt.StructType([
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('awardID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('tie', pt.StringType(), True),
                pt.StructField('notes', pt.StringType(), True),
            ]),
            header=True,
        )
        .repartition(num_partitions)
        .sort('playerID')
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
def as_spark_type(
    tpe: Union[str, type, Dtype],
    *,
    raise_error: bool = True,
    prefer_timestamp_ntz: bool = False,
) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
        if (
            hasattr(tpe, "__origin__")
            and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
            and hasattr(tpe, "__args__")
            and len(tpe.__args__) > 1  # type: ignore[union-attr]
        ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0], raise_error=raise_error  # type: ignore[union-attr]
                )
            )

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
        tpe.__origin__, list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0], raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType, or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
def process_log_data(spark, input_data, output_data):
    """ Perform ETL steps on "log-data" JSON Files

    Args:
        spark: the SparkSession currently in use by the "main()" program;
        input_data (string): path to where the data to ingest is located;
        output_data (string): path to where the processed data will be saved;
    """
    # standard schema for "log-data" JSON files is set below
    logsJsonSchema = sqlTypes.StructType([
        sqlTypes.StructField('artist', sqlTypes.StringType())
        ,sqlTypes.StructField('auth', sqlTypes.StringType())
        ,sqlTypes.StructField('firstName', sqlTypes.StringType())
        ,sqlTypes.StructField('gender', sqlTypes.StringType())
        ,sqlTypes.StructField('itemInSession', sqlTypes.IntegerType())
        ,sqlTypes.StructField('lastName', sqlTypes.StringType())
        ,sqlTypes.StructField('length', sqlTypes.FloatType())
        ,sqlTypes.StructField('level', sqlTypes.StringType())
        ,sqlTypes.StructField('location', sqlTypes.StringType())
        ,sqlTypes.StructField('method', sqlTypes.StringType())
        ,sqlTypes.StructField('page', sqlTypes.StringType())
        ,sqlTypes.StructField('registration', sqlTypes.FloatType())
        ,sqlTypes.StructField('sessionId', sqlTypes.IntegerType())
        ,sqlTypes.StructField('song', sqlTypes.StringType())
        ,sqlTypes.StructField('status', sqlTypes.ByteType())
        ,sqlTypes.StructField('ts', sqlTypes.LongType())
        ,sqlTypes.StructField('userAgent', sqlTypes.StringType())
        ,sqlTypes.StructField('userId', sqlTypes.StringType())
    ])

    # UDF to create timestamp column from original unix epoch column
    @sqlFunctions.udf(sqlTypes.TimestampType())
    def epoch_to_timestamp(unix_epoch):
        """Convert Unix Epoch values into "human-readable" timestamp format.

        Args:
            unix_epoch (int): Unix Epoch value to convert.

        Returns:
            timestamp (timestamp): Unix Epoch value conversion result.
        """
        # Unix-Epoch values are converted to human-readable timestamps
        try:
            timestamp = datetime.fromtimestamp(unix_epoch / 1000)
        # NULL values handling happens here
        except Exception:
            return None
        return timestamp

    """------------------------------------------------------------------------
    Read "log-data" files from S3 Bucket
    ------------------------------------------------------------------------"""
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'  # <<-- S3 MULTIPLE FILES PATH

    # Read "log-data" JSON Files
    logTime, startUnixEpoch, processName = getCurrentTime(processName='log_data JSON ingestion')
    print(f'{logTime} UTC: {processName} execution started.')

    df_logs = spark.read.json(
        path=log_data
        ,schema=logsJsonSchema
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # The "ts" column is converted into a human-readable timestamp;
    df_logs = df_logs.withColumn('event_timestamp', epoch_to_timestamp(df_logs['ts']))

    """----------------------------------------------------------------------------
    create DIM_USERS dimension
    ----------------------------------------------------------------------------"""
    """----------------------------------------------------------------------------
    NOTES: the "dim_users" table ETL follows the steps below:
    1. Select columns from the "log-data" DataFrame, along with the "ts" column
       (already transformed into "event_timestamp");
    2. Use a Window Function to chronologically order each user's logged event rows
       and bring the "level" option from the immediately preceding logged event to
       the current log row being evaluated. When a "previous" event is not available
       for the user, the current event being evaluated is the user's first ever
       logged event; in this case, the previous "level" attribute defaults to
       'userFirstEvent';
    3. Filter records to keep only rows where a user's current subscription "level"
       is different from their previous subscription "level";
    4. Apply another Window Function on this filtered dataset. For each listed user,
       this Window Function orders the filtered events chronologically and fetches
       to the current evaluated row the "event_timestamp" of the next event row.
       When a "next row" is not available for a given user, the Window Function
       defaults to a '9999-12-31 23:59:59' timestamp, indicating this is the user's
       current "level" option.
    ----------------------------------------------------------------------------"""
    # SET WINDOW FUNCTION SPECIFICATIONS FOR CHANGE DATA CAPTURE
    # user subscription "level" option changes tracking
    levelChangeWindowSpec = Window \
        .partitionBy(sqlFunctions.col('user_id')) \
        .orderBy(sqlFunctions.col('event_timestamp'))

    userPreviousLevelOption = sqlFunctions.lag(
        sqlFunctions.col('level'), 1, 'userFirstEvent').over(levelChangeWindowSpec)

    # user subscription "level" validity timespan
    levelValidUntilWindowSpec = Window \
        .partitionBy(sqlFunctions.col('user_id')) \
        .orderBy(sqlFunctions.col('event_timestamp'))

    # expression to calculate "subscription_level_valid_until" column
    userLevelValidUntilExpression = sqlFunctions.lead(
        sqlFunctions.col('event_timestamp')
        ,1
        ,'9999-12-31 23:59:59'
    ).over(levelValidUntilWindowSpec)

    dim_users_etl = df_logs.select(
        sqlFunctions.col('userId').alias('user_id')
        ,sqlFunctions.col('firstName').alias('first_name')
        ,sqlFunctions.col('lastName').alias('last_name')
        ,sqlFunctions.col('gender')
        ,sqlFunctions.col('level')
        ,sqlFunctions.col('event_timestamp')
    ) \
        .where("user_id IS NOT NULL AND user_id <> ''") \
        .withColumn('previous_event_subscription_level', userPreviousLevelOption) \
        .where("level <> previous_event_subscription_level") \
        .withColumn('subscription_level_valid_until', userLevelValidUntilExpression) \
        .withColumn(
            'is_current_user_level'
            ,sqlFunctions.when(
                sqlFunctions.col('subscription_level_valid_until') == '9999-12-31 23:59:59'
                ,True)
            .otherwise(False))

    # Select final columns for "dim_users" table
    dim_users_df = dim_users_etl.select(
        sqlFunctions.col('user_id')
        ,sqlFunctions.col('first_name')
        ,sqlFunctions.col('last_name')
        ,sqlFunctions.col('gender')
        ,sqlFunctions.col('level').alias('subscription_level')
        ,sqlFunctions.col('event_timestamp').alias('subscription_level_valid_since')
        ,sqlFunctions.col('subscription_level_valid_until')
        ,sqlFunctions.col('is_current_user_level')
    )

    # Write "dim_users" DataFrame to Parquet files
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_users Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_users_df.write.parquet(
        path=output_data + 'dim_users'
        ,mode='overwrite'
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    """------------------------------------------------------------------------
    create DIM_TIME dimension
    ------------------------------------------------------------------------"""
    dim_time_df = df_logs.select(
        df_logs['event_timestamp']
        ,sqlFunctions.hour(df_logs['event_timestamp']).alias('hour')
        ,sqlFunctions.dayofmonth(df_logs['event_timestamp']).alias('day_of_month')
        ,sqlFunctions.weekofyear(df_logs['event_timestamp']).alias('week_of_year')
        ,sqlFunctions.month(df_logs['event_timestamp']).alias('month')
        ,sqlFunctions.year(df_logs['event_timestamp']).alias('year')
        ,sqlFunctions.dayofweek(df_logs['event_timestamp']).alias('weekday')
    ).dropDuplicates()

    # WRITE PARTITIONED table as per Udacity's requirements
    logTime, startUnixEpoch, processName = getCurrentTime(processName='partitioned dim_time Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_time_df.write.parquet(
        path=output_data + 'dim_time'
        ,mode='overwrite'
        ,partitionBy=['year', 'month']
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # WRITE NON-PARTITIONED table for better performance
    logTime, startUnixEpoch, processName = getCurrentTime(processName='non-partitioned dim_time Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_time_df.write.parquet(
        path=output_data + 'dim_time_non_partitioned'
        ,mode='overwrite'
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    """------------------------------------------------------------------------
    create FACT_SONGPLAYS fact table
    ------------------------------------------------------------------------"""
    # "dim_songs" pre-processed table is read back to memory, thus saving IO and
    # avoiding a repetition of ETL steps already performed.
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_songs Parquet ingestion')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_songs = spark.read.parquet(output_data + 'dim_songs_non_partitioned')

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # "dim_artists" pre-processed table is read back to memory, thus saving IO and
    # avoiding a repetition of ETL steps already performed.
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_artists Parquet ingestion')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_artists = spark.read.parquet(output_data + 'dim_artists')

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # "log_data" columns for the "fact_songplays" table are selected below.
    # Add a "songplay_id" column by using a SQL function.
    # 'NextSong' pages indicate a songplay event, so only these pages are kept.
    log_events = df_logs.select(
        sqlFunctions.col('event_timestamp')
        ,sqlFunctions.col('sessionId').alias('session_id')
        ,sqlFunctions.col('userId').alias('user_id')
        ,sqlFunctions.col('level')
        ,sqlFunctions.col('song')
        ,sqlFunctions.col('length')
        ,sqlFunctions.col('artist')
        ,sqlFunctions.col('location')
        ,sqlFunctions.col('userAgent').alias('user_agent')
    ) \
        .where("page = 'NextSong' AND user_id IS NOT NULL") \
        .withColumn('songplay_id', sqlFunctions.monotonically_increasing_id())

    # joins with multiple conditions must be passed as a list
    log_songs_join_conditions = [
        log_events['song'] == dim_songs['title']
        ,log_events['length'] == dim_songs['duration']
    ]

    # the "log_events" DataFrame is joined to both "songs" and "artists" Dimensions
    # to perform a lookup of needed attributes present in them.
    fact_songplays_df = log_events \
        .join(
            other=dim_songs
            ,on=log_songs_join_conditions
            ,how='inner'
        ) \
        .join(
            other=dim_artists
            ,on=log_events['artist'] == dim_artists['artist_name']
            ,how='inner'
        ) \
        .select(
            log_events['songplay_id']
            ,log_events['event_timestamp']
            ,log_events['user_id']
            ,log_events['level']
            ,dim_songs['song_id']
            ,dim_artists['artist_id']
            ,log_events['session_id']
            ,log_events['location']
            ,log_events['user_agent']
            ,sqlFunctions.year(log_events['event_timestamp']).alias('year')
            ,sqlFunctions.month(log_events['event_timestamp']).alias('month')
        )

    # WRITE PARTITIONED table as per Udacity's requirements
    logTime, startUnixEpoch, processName = getCurrentTime(processName='partitioned fact_songplays Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    fact_songplays_df.write.parquet(
        path=output_data + 'fact_songplays'
        ,mode='overwrite'
        ,partitionBy=['year', 'month']
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # WRITE NON-PARTITIONED table for better performance
    logTime, startUnixEpoch, processName = getCurrentTime(processName='non-partitioned fact_songplays Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    fact_songplays_df.write.parquet(
        path=output_data + 'fact_songplays_non_partitioned'
        ,mode='overwrite'
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')
        parameters = getattr(tuple_type, "__tuple_params__")
    else:
        parameters = getattr(tuple_type, "__args__")
    return _DataFrame([as_spark_type(t) for t in parameters])

    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, "str", "string"],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, "int8", "byte"],
    types.ShortType(): [np.int16, "int16", "short"],
    types.IntegerType(): [int, "int", np.int, np.int32],
    types.LongType(): [np.int64, "int64", "long", "bigint"],
    types.FloatType(): [float, "float", np.float],
    types.DoubleType(): [np.float64, "float64", "double"],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, "boolean", "bool", np.bool],
    types.ArrayType(types.StringType()): [],
}


def _build_type_dict():
    return dict(
        [(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l]
        + [(spark_type, spark_type)
from pyspark.sql import types

# base type
DType = types.DataType

# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()

# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()

# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
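# A minimal usage sketch, assuming the aliases defined above are in scope: the group
# tuples make it easy to test whether every field in a schema is numeric.
schema = types.StructType([
    types.StructField('a', Integer),
    types.StructField('b', Double),
])
assert all(field.dataType in Numerics for field in schema.fields)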
def get_common_spark_testing_client(data_directory, connect):
    spark = (
        SparkSession.builder.appName("ibis_testing")
        .master("local[1]")
        .config("spark.cores.max", 1)
        .config("spark.executor.heartbeatInterval", "3600s")
        .config("spark.executor.instances", 1)
        .config("spark.network.timeout", "4200s")
        .config("spark.sql.execution.arrow.pyspark.enabled", False)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.storage.blockManagerSlaveTimeoutMs", "4200s")
        .config("spark.ui.showConsoleProgress", False)
        .config('spark.default.parallelism', 1)
        .config('spark.dynamicAllocation.enabled', False)
        .config('spark.rdd.compress', False)
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.shuffle.compress', False)
        .config('spark.shuffle.spill.compress', False)
        .config('spark.sql.shuffle.partitions', 1)
        .config('spark.ui.enabled', False)
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        )
        .repartition(num_partitions)
        .sort('index')
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (
        s.read.csv(
            path=str(data_directory / 'batting.csv'),
            schema=pt.StructType([
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('stint', pt.IntegerType(), True),
                pt.StructField('teamID', pt.StringType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('G', pt.IntegerType(), True),
                pt.StructField('AB', pt.DoubleType(), True),
                pt.StructField('R', pt.DoubleType(), True),
                pt.StructField('H', pt.DoubleType(), True),
                pt.StructField('X2B', pt.DoubleType(), True),
                pt.StructField('X3B', pt.DoubleType(), True),
                pt.StructField('HR', pt.DoubleType(), True),
                pt.StructField('RBI', pt.DoubleType(), True),
                pt.StructField('SB', pt.DoubleType(), True),
                pt.StructField('CS', pt.DoubleType(), True),
                pt.StructField('BB', pt.DoubleType(), True),
                pt.StructField('SO', pt.DoubleType(), True),
                pt.StructField('IBB', pt.DoubleType(), True),
                pt.StructField('HBP', pt.DoubleType(), True),
                pt.StructField('SH', pt.DoubleType(), True),
                pt.StructField('SF', pt.DoubleType(), True),
                pt.StructField('GIDP', pt.DoubleType(), True),
            ]),
            header=True,
        )
        .repartition(num_partitions)
        .sort('playerID')
    )
    df_batting.createOrReplaceTempView("batting")

    df_awards_players = (
        s.read.csv(
            path=str(data_directory / 'awards_players.csv'),
            schema=pt.StructType([
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('awardID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('tie', pt.StringType(), True),
                pt.StructField('notes', pt.StringType(), True),
            ]),
            header=True,
        )
        .repartition(num_partitions)
        .sort('playerID')
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
def spark_client_testing(data_directory):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt

    client = ibis.spark.connect()

    df_functional_alltypes = client._session.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = client._session.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = client._session.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = client._session.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = client._session.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = client._session.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        ['list_of_ints', 'list_of_list_of_ints', 'map_string_list_of_list_of_ints'],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = client._session.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    return client
PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]

OPTIONALS_PY = [
    (str, (False, str)),
    (decimal(1, 2), (False, decimal(1, 2))),
    (Optional[decimal(3, 4)], (True, decimal(3, 4))),
    (List[str], (False, List[str])),
    (Optional[str], (True, str)),
    (Optional[List[str]], (True, List[str])),
]
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Regular(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type)
                 for (spark_type, l) in _base.items()
                 for other_type in l])
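# A minimal usage sketch, assuming _build_type_dict, numpy as np, and the pyspark
# `types` import from the snippet above are in scope: the resulting dict maps every
# alias listed in _base back to its Spark type.
_known_types = _build_type_dict()
assert _known_types['bigint'] == types.LongType()
assert _known_types[np.int8] == types.ByteType()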
def recommend():
    json_data = request.get_json()
    user_ratings = json_data['user_ratings']
    keyword = str(json_data['keyword']).lower()
    location = str(json_data['location']).lower()
    user_ratings_data = list(user_ratings.items())

    # Define schema
    schema = T.StructType([
        T.StructField('item', T.StringType(), True),
        T.StructField('rating', T.StringType(), True)
    ])
    user_ratings_df = spark.createDataFrame(user_ratings_data, schema=schema)
    user_ratings_df = (user_ratings_df.select(
        F.col('item').cast(T.IntegerType()).alias('item'),
        F.col('rating').cast(T.ByteType()).alias('rating')))
    # user_ratings_df.printSchema()
    # print(user_ratings_df.head(10))

    predicted_rating_df = make_new_user_predictions(user_ratings_df)

    prediction_data_df = (predicted_rating_df.join(
        restaurants_with_id_df, on='item'
    ).filter(
        (F.lower(F.col('name')).like('%{}%'.format(keyword))
         | find_str_in_categories_udf(F.col('categories'), F.lit(keyword)))
        & (F.lower(F.col('location.city')).like('%{}%'.format(location))
           | F.lower(F.col('location.address1')).like('%{}%'.format(location))
           | F.lower(F.col('location.address2')).like('%{}%'.format(location))
           | F.lower(F.col('location.address3')).like('%{}%'.format(location))
           | F.lower(F.col('location.zip_code')).like('%{}%'.format(location))
           | F.lower(F.col('location.state')).like('%{}%'.format(location)))
    ).join(
        user_ratings_df.select(F.col('item'), F.col('rating').alias('user_rating')),
        on='item',
        how='left_outer'
    ).filter(F.isnull('user_rating')).sort(
        F.col('prediction'), ascending=False))
    # prediction_data_df.printSchema()
    # print(prediction_data_df.show(20, truncate=False))

    results = {}
    for i, row in enumerate(prediction_data_df.take(10)):
        results[i] = {
            'model_id': row['item'],
            'prediction': row['prediction'],
            'name': row['name'],
            'url': row['url'],
            'image_url': row['image_url'],
            'location': row['location'],
            'rating': row['rating'],
            'count_item_rating': row['count_item_rating'],
            'item_bias': row['item_bias'],
            'res_prediction': row['res_prediction'],
            'categories': row['categories']
        }
    return jsonify(results)
def as_spark_type(tpe: typing.Union[str, type, Dtype], *, raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
def test_datatype_with_structs_arrays_and_maps(self):
    first = T.StructType([
        T.StructField('f1', T.BooleanType()),
        T.StructField(
            'f2',
            T.StructType([
                T.StructField(
                    's1',
                    T.ArrayType(T.StructType([
                        T.StructField('ss1', T.BooleanType()),
                        T.StructField('ss2', T.ByteType()),
                    ])),
                ),
            ]),
        ),
        T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
        T.StructField(
            'f4',
            T.MapType(
                T.StringType(),
                T.StructType([
                    T.StructField('ss1', T.IntegerType()),
                    T.StructField('ss2', T.LongType()),
                ])),
        ),
    ])
    second = T.StructType([
        T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
        T.StructField(
            'f2',
            T.StructType([
                T.StructField(
                    's1',
                    T.ArrayType(T.StructType([
                        T.StructField('ss2', T.ByteType()),
                        T.StructField('ss1', T.BooleanType()),
                    ])),
                ),
            ]),
        ),
        T.StructField(
            'f4',
            T.MapType(
                T.StringType(),
                T.StructType([
                    T.StructField('ss2', T.LongType()),
                    T.StructField('ss1', T.IntegerType()),
                ])),
        ),
        T.StructField('f1', T.BooleanType()),
    ])
    SparklyTest().assertRowsEqual(first, second, ignore_order=True)
    with self.assertRaises(AssertionError):
        self.assertEqual(first, second)

    # change entry (f4.ss1, T.LongType)
    second = T.StructType([
        T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
        T.StructField(
            'f2',
            T.StructType([
                T.StructField(
                    's1',
                    T.ArrayType(T.StructType([
                        T.StructField('ss2', T.ByteType()),
                        T.StructField('ss1', T.BooleanType()),
                    ])),
                ),
            ]),
        ),
        T.StructField(
            'f4',
            T.MapType(
                T.StringType(),
                T.StructType([
                    T.StructField('ss2', T.LongType()),
                    T.StructField('ss1', T.LongType()),
                ])),
        ),
        T.StructField('f1', T.BooleanType()),
    ])
    with self.assertRaises(AssertionError):
        SparklyTest().assertRowsEqual(first, second, ignore_order=True)