Example #1
    def test_datatype(self):
        first = T.StructType([
            T.StructField('f1', T.BooleanType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f3', T.IntegerType()),
            T.StructField('f4', T.LongType()),
        ])
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.LongType()),
            T.StructField('f1', T.BooleanType()),
        ])

        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
        with self.assertRaises(AssertionError):
            self.assertEqual(first, second)

        # change the type of field f4 from LongType to StringType
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.StringType()),
            T.StructField('f1', T.BooleanType()),
        ])

        with self.assertRaises(AssertionError):
            SparklyTest().assertRowsEqual(first, second, ignore_order=True)
Example #2
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Keep the name of the function attribute that caches the map in a variable (instead of repeating it
    # in 'dot' notation) to avoid copy/paste/typo mistakes.
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
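A minimal usage sketch of the helper above (illustrative only; assumes numpy is imported as np, as in the function body):

import pyspark.sql.types as T  # needed only for the assertions below

mapping = _numpy_to_spark_mapping()
assert mapping[np.int32] == T.IntegerType()   # NumPy int32 maps to Spark IntegerType
assert _numpy_to_spark_mapping() is mapping   # repeated calls reuse the cached dictionary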
Example #3
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
Example #4
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example #5
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #6
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal, ):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #7
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .config('spark.default.parallelism', 4)
        .config('spark.driver.bindAddress', '127.0.0.1')
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))

    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example #8
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],
                    raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__,
            list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0],
            raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
Example #9
def process_log_data(spark, input_data, output_data):
    """ Perform ETL steps on "log-data" JSON Files

    Args:
        spark: the SparkSession currently in use by the "main()" program;

        input_data (string): path to where the data to ingest is located;

        output_data (string): path to where the processed data will be saved; 
    """
    
    # standard schema for "log-data" JSON files is set below
    logsJsonSchema = sqlTypes.StructType([
         sqlTypes.StructField('artist',sqlTypes.StringType())
        ,sqlTypes.StructField('auth',sqlTypes.StringType())
        ,sqlTypes.StructField('firstName',sqlTypes.StringType())
        ,sqlTypes.StructField('gender',sqlTypes.StringType())
        ,sqlTypes.StructField('itemInSession',sqlTypes.IntegerType())
        ,sqlTypes.StructField('lastName',sqlTypes.StringType())
        ,sqlTypes.StructField('length',sqlTypes.FloatType())
        ,sqlTypes.StructField('level',sqlTypes.StringType())
        ,sqlTypes.StructField('location',sqlTypes.StringType())
        ,sqlTypes.StructField('method',sqlTypes.StringType())
        ,sqlTypes.StructField('page',sqlTypes.StringType())
        ,sqlTypes.StructField('registration',sqlTypes.FloatType())
        ,sqlTypes.StructField('sessionId',sqlTypes.IntegerType())
        ,sqlTypes.StructField('song',sqlTypes.StringType())
        ,sqlTypes.StructField('status',sqlTypes.ByteType())
        ,sqlTypes.StructField('ts',sqlTypes.LongType())
        ,sqlTypes.StructField('userAgent',sqlTypes.StringType())
        ,sqlTypes.StructField('userId',sqlTypes.StringType())
    ])
    
    # UDF to create timestamp column from original unix epoch column
    @sqlFunctions.udf(sqlTypes.TimestampType())
    def epoch_to_timestamp(unix_epoch): 
        """Convert Unix Epoch values into "human-readable" timestamp format.
        
        Args:
            unix_epoch (int): Unix Epoch value to convert.
            
        Returns:
            timestamp (timestamp): Unix Epoch value conversion result.
        """

        # Unix-Epoch values are converted to human-readable timestamps
        try:
            timestamp = datetime.fromtimestamp(unix_epoch / 1000)

        # NULL values handling happens here
        except Exception:
            return None  

        return timestamp
    
    """------------------------------------------------------------------------
        Read "log-data" files from S3 Bucket
    ------------------------------------------------------------------------"""

    # get filepath to log data file
    log_data = input_data+'log_data/*/*/*.json' # <<-- S3 MULTIPLE FILES PATH

    # Read "log-data" JSON Files
    logTime, startUnixEpoch, processName = getCurrentTime(processName='log_data JSON ingestion')
    print(f'{logTime} UTC: {processName} execution started.')

    df_logs = spark.read.json(
         path=log_data
        ,schema=logsJsonSchema
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')
    
    #  The "ts" column is converted into human-readable timestamp;
    df_logs = df_logs.withColumn('event_timestamp',epoch_to_timestamp(df_logs['ts']))

    """----------------------------------------------------------------------------
        create DIM_USERS dimension
    ----------------------------------------------------------------------------"""

    """----------------------------------------------------------------------------
    
        NOTES: the "dim_users" table ETL follows the steps below:
    
        1. Select columns from the "log-data" DataFrame, along with the "ts" column
    (already transformed into "event_timestamp");
        2. Use a Window Function to chronologically order each user's logged event 
    row and bring the "level" option from the immediately preceding logged event to
    the current log row being evaluated. When a "previous" event is not available
    for the user, it means the current event being evaluated is the user's first
    ever logged event. In this case, the previous "level" attribute defaults to
    'userFirstEvent';
        3. Filter records to keep only rows where a user's current subscription 
    "level" is different from its previous subscription "level".
        4. Apply another Window Function on this filtered dataset. For each listed
    user, this Window Function will order the filtered events chronologically and
    fetch to the current evaluated row the "event timestamp" of the next event row.
    When a "next row" is not available for a given user, the Window Function will
    then default to a '9999-12-31 23:59:59' timestamp, indicating this is the 
    user's current "level" option.
     
    ----------------------------------------------------------------------------"""

    #  SET WINDOW FUNCTION SPECIFICATIONS FOR CHANGE DATA CAPTURE

    # user subscription "level" option changes tracking
    levelChangeWindowSpec = Window \
        .partitionBy(sqlFunctions.col('user_id')) \
        .orderBy(sqlFunctions.col('event_timestamp'))

    userPreviousLevelOption = sqlFunctions.lag(sqlFunctions.col('level'),1,'userFirstEvent').over(levelChangeWindowSpec)

    # user subscription "level" validity timespan
    levelValidUntilWindowSpec = Window \
        .partitionBy(sqlFunctions.col('user_id')) \
        .orderBy(sqlFunctions.col('event_timestamp'))

    # expression to calculate "subscription_level_valid_until" column
    userLevelValidUntilExpression = sqlFunctions.lead(
         sqlFunctions.col('event_timestamp')
        ,1
        ,'9999-12-31 23:59:59'
    ).over(levelValidUntilWindowSpec)

    dim_users_etl = df_logs.select(
         sqlFunctions.col('userId').alias('user_id')
        ,sqlFunctions.col('firstName').alias('first_name')
        ,sqlFunctions.col('lastName').alias('last_name')
        ,sqlFunctions.col('gender')
        ,sqlFunctions.col('level')
        ,sqlFunctions.col('event_timestamp')
    ) \
    .where("user_id IS NOT NULL AND user_id <> ''") \
    .withColumn('previous_event_subscription_level',userPreviousLevelOption) \
    .where("level <> previous_event_subscription_level") \
    .withColumn('subscription_level_valid_until',userLevelValidUntilExpression) \
    .withColumn(
         'is_current_user_level'
        ,sqlFunctions.when(
             sqlFunctions.col('subscription_level_valid_until') == '9999-12-31 23:59:59'
            ,True) \
        .otherwise(False))
    
    # Select final columns for "dim_users" table
    dim_users_df = dim_users_etl.select(
         sqlFunctions.col('user_id')
        ,sqlFunctions.col('first_name')
        ,sqlFunctions.col('last_name')
        ,sqlFunctions.col('gender')
        ,sqlFunctions.col('level').alias('subscription_level')
        ,sqlFunctions.col('event_timestamp').alias('subscription_level_valid_since')
        ,sqlFunctions.col('subscription_level_valid_until')
        ,sqlFunctions.col('is_current_user_level')
    )
    
    # Write "dim_users" DataFrame to Parquet files
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_users Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_users_df.write.parquet(
         path=output_data+'dim_users'
        ,mode='overwrite'    
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')
    
    """------------------------------------------------------------------------
        create DIM_TIME dimension
    ------------------------------------------------------------------------"""

    dim_time_df = df_logs.select(
         df_logs['event_timestamp']
        ,sqlFunctions.hour(df_logs['event_timestamp']).alias('hour')
        ,sqlFunctions.dayofmonth(df_logs['event_timestamp']).alias('day_of_month')
        ,sqlFunctions.weekofyear(df_logs['event_timestamp']).alias('week_of_year')
        ,sqlFunctions.month(df_logs['event_timestamp']).alias('month')
        ,sqlFunctions.year(df_logs['event_timestamp']).alias('year')
        ,sqlFunctions.dayofweek(df_logs['event_timestamp']).alias('weekday')
    ).dropDuplicates()
    
    # WRITE PARTITIONED table as per Udacity's requirements
    logTime, startUnixEpoch, processName = getCurrentTime(processName='partitioned dim_time Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')


    dim_time_df.write.parquet(
         path=output_data+'dim_time'
        ,mode='overwrite'
        ,partitionBy=['year','month']
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # WRITE NON-PARTITIONED table for better performance
    logTime, startUnixEpoch, processName = getCurrentTime(processName='non-partitioned dim_time Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_time_df.write.parquet(
         path=output_data+'dim_time_non_partitioned'
        ,mode='overwrite'
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    """------------------------------------------------------------------------
        create FACT_SONGPLAYS fact table
    ------------------------------------------------------------------------"""

    #  "dim_songs" pre processed table is read back to memory, thus saving IO and
    # avoiding a repetition of ETL steps already performed.
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_songs Parquet ingestion')
    print(f'{logTime} UTC: {processName} execution started.')

    dim_songs = spark.read.parquet(output_data+'dim_songs_non_partitioned')

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')
    
    #  "dim_artists" pre processed table is read back to memory, thus saving IO and
    # avoiding a repetition of ETL steps already performed.
    logTime, startUnixEpoch, processName = getCurrentTime(processName='dim_artists Parquet ingestion')
    print(f'{logTime} UTC: {processName} execution started.')
    
    dim_artists = spark.read.parquet(output_data+'dim_artists')

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # "log_data" columns for the "fact_songplays" table are selected below
    # Add a "songplay_id" column by using a SQL function
    # 'NextSong' pages indicate a songplay event, so only these pages are kept.
    log_events = df_logs.select(
         sqlFunctions.col('event_timestamp')
        ,sqlFunctions.col('sessionId').alias('session_id')
        ,sqlFunctions.col('userId').alias('user_id')
        ,sqlFunctions.col('level')
        ,sqlFunctions.col('song')
        ,sqlFunctions.col('length')
        ,sqlFunctions.col('artist')
        ,sqlFunctions.col('location')
        ,sqlFunctions.col('userAgent').alias('user_agent')
    ) \
    .where("page = 'NextSong' AND user_id IS NOT NULL") \
    .withColumn('songplay_id',sqlFunctions.monotonically_increasing_id())

    # joins with multiple conditions must be passed as a list
    log_songs_join_conditions = [log_events['song'] == dim_songs['title'] , log_events['length'] == dim_songs['duration']]

    #  the "log_events" DataFrame is joined to both "songs" and "artists" Dimensions
    # to perform a lookup of needed attributes present in them.
    fact_songplays_df = log_events \
        .join(
             other=dim_songs
            ,on=log_songs_join_conditions
            ,how='inner'
        ) \
        .join(
             other=dim_artists
            ,on=log_events['artist'] == dim_artists['artist_name']
            ,how='inner'
        ) \
        .select(
             log_events['songplay_id']
            ,log_events['event_timestamp']
            ,log_events['user_id']
            ,log_events['level']
            ,dim_songs['song_id']
            ,dim_artists['artist_id']
            ,log_events['session_id']
            ,log_events['location']
            ,log_events['user_agent']
            ,sqlFunctions.year(log_events['event_timestamp']).alias('year')
            ,sqlFunctions.month(log_events['event_timestamp']).alias('month') 
        )

    # WRITE PARTITIONED table as per Udacity's requirements
    logTime, startUnixEpoch, processName = getCurrentTime(processName='partitioned fact_songplays Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')

    fact_songplays_df.write.parquet(
         path=output_data+'fact_songplays'
        ,mode='overwrite'
        ,partitionBy=['year','month']
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')

    # WRITE NON-PARTITIONED table for better performance
    logTime, startUnixEpoch, processName = getCurrentTime(processName='non-partitioned fact_songplays Parquet write')
    print(f'{logTime} UTC: {processName} execution started.')
    
    fact_songplays_df.write.parquet(
         path=output_data+'fact_songplays_non_partitioned'
        ,mode='overwrite'
    )

    logTime, completionUnixEpoch = getCurrentTime()
    print(f'{logTime} UTC: {processName} took {completionUnixEpoch - startUnixEpoch} ms to execute.')
Example #10
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")
        return _DataFrame([as_spark_type(t) for t in parameters])
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, "str", "string"],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, "int8", "byte"],
    types.ShortType(): [np.int16, "int16", "short"],
    types.IntegerType(): [int, "int", np.int, np.int32],
    types.LongType(): [np.int64, "int64", "long", "bigint"],
    types.FloatType(): [float, "float", np.float],
    types.DoubleType(): [np.float64, "float64", "double"],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, "boolean", "bool", np.bool],
    types.ArrayType(types.StringType()): [],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l] + [(spark_type, spark_type)
                                         for (spark_type, _) in _base.items()])
Example #11
from pyspark.sql import types

# base type
DType = types.DataType
# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()
# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()
# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
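One possible way to use these aliases, for example to pick out the numeric columns of a DataFrame (a sketch, not part of the original module):

def numeric_columns(df):
    """Return the names of the DataFrame columns whose Spark type is in the Numerics group."""
    # DataType instances compare by value, so a membership test against the tuple works.
    return [field.name for field in df.schema.fields if field.dataType in Numerics]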
Example #12
def get_common_spark_testing_client(data_directory, connect):
    spark = (
        SparkSession.builder.appName("ibis_testing")
        .master("local[1]")
        .config("spark.cores.max", 1)
        .config("spark.executor.heartbeatInterval", "3600s")
        .config("spark.executor.instances", 1)
        .config("spark.network.timeout", "4200s")
        .config("spark.sql.execution.arrow.pyspark.enabled", False)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.storage.blockManagerSlaveTimeoutMs", "4200s")
        .config("spark.ui.showConsoleProgress", False)
        .config('spark.default.parallelism', 1)
        .config('spark.dynamicAllocation.enabled', False)
        .config('spark.rdd.compress', False)
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.shuffle.compress', False)
        .config('spark.shuffle.spill.compress', False)
        .config('spark.sql.shuffle.partitions', 1)
        .config('spark.ui.enabled', False)
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))

    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView("batting")

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example #13
def spark_client_testing(data_directory):
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt

    client = ibis.spark.connect()

    df_functional_alltypes = client._session.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = client._session.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = client._session.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = client._session.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = client._session.createDataFrame([((1, 2, 'a'), )],
                                                ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = client._session.createDataFrame([(
        [1, 2],
        [[3, 4], [5, 6]],
        {
            'a': [[2, 4], [3, 5]]
        },
    )], [
        'list_of_ints', 'list_of_list_of_ints',
        'map_string_list_of_list_of_ints'
    ])
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = client._session.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    return client
Example #14

PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]

OPTIONALS_PY = [
    (str, (False, str)),
    (decimal(1, 2), (False, decimal(1, 2))),
    (Optional[decimal(3, 4)], (True, decimal(3, 4))),
    (List[str], (False, List[str])),
    (Optional[str], (True, str)),
    (Optional[List[str]], (True, List[str])),
]
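A sketch of how these fixture lists could be consumed (hypothetical; the originals are presumably fed to parametrized tests):

# Collapse the (python_type, spark_type) pairs into a single lookup table.
PY_TO_SPARK = dict(PRIMITIVES + SYNTHETIC_PRIMITIVES + DATE_TYPES)
assert PY_TO_SPARK[bool] == t.BooleanType()
assert PY_TO_SPARK[datetime] == t.TimestampType()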
Example #15
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Regular(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l])

Example #16
def recommend():
    json_data = request.get_json()

    user_ratings = json_data['user_ratings']
    keyword = str(json_data['keyword']).lower()
    location = str(json_data['location']).lower()

    user_ratings_data = list(user_ratings.items())

    # Define schema
    schema = T.StructType([
        T.StructField('item', T.StringType(), True),
        T.StructField('rating', T.StringType(), True)
    ])

    user_ratings_df = spark.createDataFrame(user_ratings_data, schema=schema)

    user_ratings_df = (user_ratings_df.select(
        F.col('item').cast(T.IntegerType()).alias('item'),
        F.col('rating').cast(T.ByteType()).alias('rating')))

    # user_ratings_df.printSchema()
    # print(user_ratings_df.head(10))

    predicted_rating_df = make_new_user_predictions(user_ratings_df)

    prediction_data_df = (predicted_rating_df.join(
        restaurants_with_id_df, on='item'
    ).filter(
        (F.lower(F.col('name')).like('%{}%'.format(keyword))
         | find_str_in_categories_udf(F.col('categories'), F.lit(keyword)))
        & (F.lower(F.col('location.city')).like('%{}%'.format(location))
           | F.lower(F.col('location.address1')).like('%{}%'.format(location))
           | F.lower(F.col('location.address2')).like('%{}%'.format(location))
           | F.lower(F.col('location.address3')).like('%{}%'.format(location))
           | F.lower(F.col('location.zip_code')).like('%{}%'.format(location))
           | F.lower(F.col('location.state')).like('%{}%'.format(location)))
    ).join(user_ratings_df.select(F.col('item'),
                                  F.col('rating').alias('user_rating')),
           on='item',
           how='left_outer').filter(F.isnull('user_rating')).sort(
               F.col('prediction'), ascending=False))

    # prediction_data_df.printSchema()
    # print(prediction_data_df.show(20, truncate=False))

    results = {}
    for i, row in enumerate(prediction_data_df.take(10)):
        results[i] = {
            'model_id': row['item'],
            'prediction': row['prediction'],
            'name': row['name'],
            'url': row['url'],
            'image_url': row['image_url'],
            'location': row['location'],
            'rating': row['rating'],
            'count_item_rating': row['count_item_rating'],
            'item_bias': row['item_bias'],
            'res_prediction': row['res_prediction'],
            'categories': row['categories']
        }

    return jsonify(results)
Example #17
def as_spark_type(tpe: typing.Union[str, type, Dtype],
                  *,
                  raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__,
                                                   list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0],
                                     raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
Example #18
    def test_datatype_with_structs_arrays_and_maps(self):
        first = T.StructType([
            T.StructField('f1', T.BooleanType()),
            T.StructField(
                'f2',
                T.StructType([
                    T.StructField(
                        's1',
                        T.ArrayType(T.StructType([
                            T.StructField('ss1', T.BooleanType()),
                            T.StructField('ss2', T.ByteType())
                        ])),
                    ),
                ]),
            ),
            T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
            T.StructField(
                'f4',
                T.MapType(
                    T.StringType(),
                    T.StructType([
                        T.StructField('ss1', T.IntegerType()),
                        T.StructField('ss2', T.LongType()),
                    ])),
            ),
        ])
        second = T.StructType([
            T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
            T.StructField(
                'f2',
                T.StructType([
                    T.StructField(
                        's1',
                        T.ArrayType(T.StructType([
                            T.StructField('ss2', T.ByteType()),
                            T.StructField('ss1', T.BooleanType()),
                        ])),
                    ),
                ]),
            ),
            T.StructField(
                'f4',
                T.MapType(
                    T.StringType(),
                    T.StructType([
                        T.StructField('ss2', T.LongType()),
                        T.StructField('ss1', T.IntegerType()),
                    ])),
            ),
            T.StructField('f1', T.BooleanType()),
        ])

        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
        with self.assertRaises(AssertionError):
            self.assertEqual(first, second)

        # change the type of field f4.ss1 from IntegerType to LongType
        second = T.StructType([
            T.StructField('f3', T.ArrayType(T.MapType(T.StringType(), T.IntegerType()))),
            T.StructField(
                'f2',
                T.StructType([
                    T.StructField(
                        's1',
                        T.ArrayType(T.StructType([
                            T.StructField('ss2', T.ByteType()),
                            T.StructField('ss1', T.BooleanType()),
                        ])),
                    ),
                ]),
            ),
            T.StructField(
                'f4',
                T.MapType(
                    T.StringType(),
                    T.StructType([
                        T.StructField('ss2', T.LongType()),
                        T.StructField('ss1', T.LongType()),
                    ])),
            ),
            T.StructField('f1', T.BooleanType()),
        ])

        with self.assertRaises(AssertionError):
            SparklyTest().assertRowsEqual(first, second, ignore_order=True)