def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping
    dictionary in order to avoid instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
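# Minimal usage sketch for _numpy_to_spark_mapping above (assumes NumPy < 2.0,
# where the np.string_ / np.unicode_ aliases still exist). Note that unsigned
# ints are widened to the next larger signed Spark type, and that the dict is
# built once and cached as a function attribute:
import numpy as np

mapping = _numpy_to_spark_mapping()
print(mapping[np.int16])   # ShortType
print(mapping[np.uint16])  # IntegerType -- widened so the full uint16 range fits
assert _numpy_to_spark_mapping() is mapping  # second call returns the cached dict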
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
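# A small illustrative call sequence for infer_spark_type above. Only the
# built-in branches are exercised here; `long`, `short`, `byte`, and
# `BoundDecimal` are synthetic marker types assumed to come from the
# surrounding library:
from datetime import datetime
from decimal import Decimal
import pyspark.sql.types as t

assert infer_spark_type(str) == t.StringType()
assert infer_spark_type(float) == t.DoubleType()  # Python floats are 64-bit
assert infer_spark_type(Decimal) == t.DecimalType(precision=36, scale=6)
assert infer_spark_type(datetime) == t.TimestampType()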
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal, ):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
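# Quick usage sketch for the variant above (runnable only on NumPy < 1.24,
# where the deprecated np.int / np.float / np.bool aliases still exist).
# Note this variant maps Python's `float` to FloatType even though CPython
# floats are 64-bit; the fuller variants later in this collection map it
# to DoubleType instead:
import datetime
import numpy as np
import pyspark.sql.types as types

assert as_spark_type("bigint") == types.LongType()
assert as_spark_type(datetime.date) == types.DateType()
assert as_spark_type(np.ndarray) == types.ArrayType(types.StringType())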
def as_spark_type(
    tpe: Union[str, type, Dtype], *, raise_error: bool = True, prefer_timestamp_ntz: bool = False
) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
        if (
            hasattr(tpe, "__origin__")
            and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
            and hasattr(tpe, "__args__")
            and len(tpe.__args__) > 1  # type: ignore[union-attr]
        ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(tpe.__args__[1].__args__[0], raise_error=raise_error)  # type: ignore[union-attr]
            )

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
        tpe.__origin__, list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0], raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType, or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
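# Hedged usage sketch for the extension-aware variant above. Assumes pyspark
# 3.2+ (for TimestampNTZType) and a pandas build where the nullable extension
# dtypes are available (the extension_*_available flags are feature checks
# defined elsewhere in the surrounding module):
import datetime
import pandas as pd
import pyspark.sql.types as types

assert as_spark_type(pd.Int64Dtype()) == types.LongType()
assert as_spark_type("Float32") == types.FloatType()
assert as_spark_type("category") == types.LongType()  # categoricals are stored as integer codes
assert as_spark_type(datetime.datetime, prefer_timestamp_ntz=True) == types.TimestampNTZType()
assert as_spark_type(dict, raise_error=False) is None  # unknown types return None instead of raising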
"PolyPhen", ) ) parsed_df.printSchema() # + # x = parsed_df.filter(f.col("Condel") != '-').limit(10).toPandas() # x # - dtypes = { "consequence": t.ArrayType(t.StringType()), "Existing_variation": t.ArrayType(t.StringType()), "ALLELE_NUM": t.IntegerType(), "DISTANCE": t.IntegerType(), "STRAND": t.ShortType(), "FLAGS": t.ArrayType(t.StringType()), "HGNC_ID": t.IntegerType(), # "CANONICAL": t.BooleanType(), # needs manual check if column equals "CANONICAL" "TREMBL": t.ArrayType(t.StringType()), "REFSEQ_MATCH": t.ArrayType(t.StringType()), "GENE_PHENO": t.BooleanType(), "sift_score": t.FloatType(), "polyphen_score": t.FloatType(), "EXON": t.ArrayType(t.IntegerType()), "INTRON": t.ArrayType(t.IntegerType()), "HGVS_OFFSET": t.IntegerType(), "AF": t.ArrayType(t.FloatType()), "AFR_AF": t.ArrayType(t.FloatType()), "AMR_AF": t.ArrayType(t.FloatType()), "EAS_AF": t.ArrayType(t.FloatType()),
                types.StructField(
                    field,
                    types.StructType([types.StructField('0', types.DoubleType())])))
        elif 'Altitude' in field or 'diff_' in field:
            schema_fields.add(
                types.StructField(
                    field,
                    types.StructType([types.StructField('0', types.FloatType())])))
        elif 'RSSI_' in field:
            schema_fields.add(
                types.StructField(
                    field,
                    types.StructType([types.StructField('0', types.ShortType())])))
        elif 'tmp_' in field:
            schema_fields.add(
                types.StructField(
                    field,
                    types.StructType([types.StructField('0', types.LongType())])))
        elif 'mean' in field:
            schema_fields.add(
                types.StructField(
                    field,
                    types.StructType([types.StructField('0', types.DoubleType())])))
        else:
            schema_fields.add(
                types.StructField(
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Regular(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l])


def _build_py_type_dict():
from pyspark.sql import types

# base type
DType = types.DataType

# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()

# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()

# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
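# A small sketch of how the alias groups above might be used: selecting the
# numeric columns of a DataFrame by membership in Numerics. Tuple membership
# works because pyspark DataType instances compare equal by class:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, 2.0, "x")], ["a", "b", "c"])

numeric_cols = [f.name for f in df.schema.fields if f.dataType in Numerics]
print(numeric_cols)  # ['a', 'b']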
def make_udf(threshs):
    return F.udf(lambda x: sum([x > y for y in threshs]), T.ShortType())
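# Usage sketch for make_udf above: the UDF counts how many thresholds a value
# exceeds, effectively bucketing it. Column and threshold values here are
# hypothetical:
from pyspark.sql import SparkSession, functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(5,), (25,), (75,)], ["score"])

bucketize = make_udf([10, 50, 90])
df.withColumn("bucket", bucketize("score")).show()
# score 5 -> bucket 0, 25 -> 1, 75 -> 2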
def get_common_spark_testing_client(data_directory, connect):
    spark = (
        SparkSession.builder.appName("ibis_testing")
        .master("local[1]")
        .config("spark.cores.max", 1)
        .config("spark.executor.heartbeatInterval", "3600s")
        .config("spark.executor.instances", 1)
        .config("spark.network.timeout", "4200s")
        .config("spark.sql.execution.arrow.pyspark.enabled", False)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.storage.blockManagerSlaveTimeoutMs", "4200s")
        .config("spark.ui.showConsoleProgress", False)
        .config('spark.default.parallelism', 1)
        .config('spark.dynamicAllocation.enabled', False)
        .config('spark.rdd.compress', False)
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.shuffle.compress', False)
        .config('spark.shuffle.spill.compress', False)
        .config('spark.sql.shuffle.partitions', 1)
        .config('spark.ui.enabled', False)
        .getOrCreate())
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView("batting")

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]}, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
def spark_client_testing(data_directory):
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt

    client = ibis.spark.connect()

    df_functional_alltypes = client._session.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = client._session.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = client._session.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = client._session.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = client._session.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = client._session.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        ['list_of_ints', 'list_of_list_of_ints', 'map_string_list_of_list_of_ints'])
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = client._session.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]}, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    return client
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]

OPTIONALS_PY = [
    (str, (False, str)),
    (decimal(1, 2), (False, decimal(1, 2))),
    (Optional[decimal(3, 4)], (True, decimal(3, 4))),
    (List[str], (False, List[str])),
    (Optional[str], (True, str)),
    (Optional[List[str]], (True, List[str])),
def infer_spark_dtype(df, col):
    """
    Deduce the correct spark dtype from the pandas dtype for column `col`
    of pandas dataframe `df`.
    """
    logger = logging.getLogger(__name__ + ".infer_spark_dtype")
    pd_dtype = df.dtypes[col]

    # get a sample from column col
    sample = df[col].dropna()
    if sample.shape[0] == 0:
        logger.warning("column %s of dtype %s containing nulls found" % (col, pd_dtype))
        sample = None
    else:
        sample = sample.iloc[0]

    # infer spark dtype
    # datetimes
    if pd.api.types.is_datetime64_any_dtype(pd_dtype):
        ret = T.TimestampType()

    # ints
    elif (pd_dtype == 'int8') or (pd_dtype == 'int16'):
        ret = T.ShortType()
    elif pd_dtype == 'int32':
        ret = T.IntegerType()
    elif pd.api.types.is_int64_dtype(pd_dtype):
        ret = T.LongType()

    # uints
    elif pd_dtype == 'uint8':
        ret = T.ShortType()
    elif pd_dtype == 'uint16':
        ret = T.IntegerType()
    elif pd_dtype == 'uint32':
        ret = T.LongType()
    elif pd_dtype == 'uint64':
        logger.warning("converting column %s of type uint64 to spark LongType - "
                       "overflows will be nulls" % col)
        ret = T.LongType()

    # floats
    elif (pd_dtype == 'float16') or (pd_dtype == 'float32'):
        ret = T.FloatType()
    elif pd_dtype == 'float64':
        ret = T.DoubleType()

    elif pd_dtype == 'bool':
        ret = T.BooleanType()

    # object
    elif pd_dtype == 'object':
        if (sample is None) or (isinstance(sample, str)):
            logger.warning("converting column %s of type object to spark StringType" % col)
            ret = T.StringType()
        elif isinstance(sample, tuple):
            raise NotImplementedError("cannot convert column %s containing tuples to spark" % col)
        else:
            raise NotImplementedError("values in column %s of type object not understood" % col)

    # category
    elif pd.api.types.is_categorical_dtype(pd_dtype):
        logger.warning("converting column %s of type category to spark StringType" % col)
        ret = T.StringType()

    else:
        raise NotImplementedError("column %s of type %s not understood" % (col, pd_dtype))

    return ret
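# Sketch of driving infer_spark_dtype above over a whole pandas frame to build
# a Spark schema; the frame contents here are hypothetical:
import pandas as pd
import pyspark.sql.types as T

pdf = pd.DataFrame({
    "a": pd.Series([1, 2], dtype="int16"),
    "b": pd.Series([0.5, 1.5], dtype="float64"),
    "c": ["x", None],  # object column with nulls -> StringType, with a warning
})

schema = T.StructType([
    T.StructField(col, infer_spark_dtype(pdf, col), nullable=True)
    for col in pdf.columns
])
print(schema.simpleString())  # struct<a:smallint,b:double,c:string>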
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .config('spark.default.parallelism', 4)
        .config('spark.driver.bindAddress', '127.0.0.1')
        .getOrCreate())
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')
    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]}, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
.master("yarn")\ .getOrCreate() spark_session.sparkContext.addFile('parse_tool.py') from parse_tool import parse_logs # User logs collection user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/") parsed_logs = user_logs.map(parse_logs) \ .filter(lambda parse_res : parse_res[8] != "") \ .map(lambda parse_res : [ parse_res[0], # ip parse_res[0] + '_' + parse_res[7], # ip+user_agent parse_res[8] # hour ]) schema = tp.StructType().add("ip", tp.StringType()) \ .add("user_id", tp.StringType()) \ .add("hour", tp.ShortType()) user_log_df = spark_session.createDataFrame(parsed_logs, schema) res = user_log_df.groupby("hour") \ .agg(fn.countDistinct("user_id").alias("unique_visitors_count")) \ .orderBy("hour") \ .collect() for hour, count in res: print(hour, count, sep='\t')
    else:
        parameters = getattr(tuple_type, "__args__")
        return _DataFrame([as_spark_type(t) for t in parameters])
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, "str", "string"],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, "int8", "byte"],
    types.ShortType(): [np.int16, "int16", "short"],
    types.IntegerType(): [int, "int", np.int, np.int32],
    types.LongType(): [np.int64, "int64", "long", "bigint"],
    types.FloatType(): [float, "float", np.float],
    types.DoubleType(): [np.float64, "float64", "double"],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, "boolean", "bool", np.bool],
    types.ArrayType(types.StringType()): [],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l] +
                [(spark_type, spark_type) for (spark_type, _) in _base.items()])
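# Sketch of the inverted lookup table produced by _build_type_dict() above
# (assumes the module's `import pyspark.sql.types as types`; note the
# np.int / np.float / np.bool aliases in _base only exist on NumPy < 1.24):
import pyspark.sql.types as types

_type_dict = _build_type_dict()

assert _type_dict["bigint"] == types.LongType()
assert _type_dict[str] == types.StringType()
# Spark types map to themselves, so already-converted values pass through:
assert _type_dict[types.DoubleType()] == types.DoubleType()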
def as_spark_type(tpe: typing.Union[str, type, Dtype], *, raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
def process_song_data(spark, input_data, output_data):
    """
    Description: This function processes song-data files from the given input
        path, transforms the JSON data into songs and artists Spark tables,
        and writes these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object.
        input_data: Path to the input JSON files.
        output_data: Path to the output directory that stores output parquet tables.

    Returns:
        df: Song data dataframe.
    """
    # get filepath to song data file
    song_data = input_data + 'song-data/A/B/*/*.json'

    # define schema for song data file
    song_schema = t.StructType([
        t.StructField("artist_id", t.StringType(), True),
        t.StructField("artist_latitude", t.DecimalType(11, 7), True),
        t.StructField("artist_location", t.StringType(), True),
        t.StructField("artist_longitude", t.DecimalType(11, 7), True),
        t.StructField("artist_name", t.StringType(), True),
        t.StructField("duration", t.DecimalType(11, 7), True),
        t.StructField("num_songs", t.IntegerType(), True),
        t.StructField("song_id", t.StringType(), True),
        t.StructField("title", t.StringType(), True),
        t.StructField("year", t.ShortType(), True)
    ])

    # read song data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(song_schema) \
        .load(song_data)

    # extract columns to create songs table
    songs_table = df \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration']) \
        .dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_output = output_data + 'songs'
    songs_table \
        .write \
        .partitionBy('year', 'artist_id') \
        .option("path", songs_output) \
        .saveAsTable('songs', format='parquet')

    # extract columns to create artists table
    artists_table = df \
        .select(['artist_id', 'artist_name', 'artist_location',
                 'artist_longitude', 'artist_latitude']) \
        .dropDuplicates()

    # write artists table to parquet files
    artists_output = output_data + 'artists'
    artists_table \
        .write \
        .option("path", artists_output) \
        .saveAsTable('artists', format='parquet')

    return df
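# Hedged invocation sketch for process_song_data above; the bucket paths are
# hypothetical placeholders for the real input/output locations:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("song_etl").getOrCreate()
song_df = process_song_data(
    spark,
    input_data="s3a://example-input-bucket/",    # hypothetical
    output_data="s3a://example-output-bucket/",  # hypothetical
)
song_df.printSchema()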