def check_hist_plot(kser):
    bins = np.array([1.0, 5.9, 10.8, 15.7, 20.6, 25.5, 30.4, 35.3, 40.2, 45.1, 50.0])
    data = np.array([5.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
    prev = bins[0]
    text_bins = []
    for b in bins[1:]:
        text_bins.append("[%s, %s)" % (prev, b))
        prev = b
    text_bins[-1] = text_bins[-1][:-1] + "]"
    bins = 0.5 * (bins[:-1] + bins[1:])
    name_a = name_like_string(kser.name)
    bars = [
        go.Bar(
            x=bins,
            y=data,
            name=name_a,
            text=text_bins,
            hovertemplate=("variable=" + name_a + "<br>value=%{text}<br>count=%{y}"),
        ),
    ]
    fig = go.Figure(data=bars, layout=go.Layout(barmode="stack"))
    fig["layout"]["xaxis"]["title"] = "value"
    fig["layout"]["yaxis"]["title"] = "count"

    self.assertEqual(
        pprint.pformat(kser.plot(kind="hist").to_dict()),
        pprint.pformat(fig.to_dict()),
    )

def __init__(
    self, dtypes: List[Dtype], spark_types: List[types.DataType], names: List[Optional[str]]
):
    from pyspark.pandas.utils import name_like_string

    self.dtypes = dtypes
    self.spark_type = types.StructType(
        [
            types.StructField(name_like_string(n) if n is not None else ("c%s" % i), t)
            for i, (n, t) in enumerate(zip(names, spark_types))
        ]
    )  # type: types.StructType

def check_pox_plot(kser):
    fig = go.Figure()
    fig.add_trace(
        go.Box(
            name=name_like_string(kser.name),
            q1=[3],
            median=[6],
            q3=[9],
            mean=[10.0],
            lowerfence=[1],
            upperfence=[15],
            y=[[50]],
            boxpoints="suspectedoutliers",
            notched=False,
        )
    )
    fig["layout"]["xaxis"]["title"] = name_like_string(kser.name)
    fig["layout"]["yaxis"]["title"] = "value"

    self.assertEqual(
        pprint.pformat(kser.plot(kind="box").to_dict()),
        pprint.pformat(fig.to_dict()),
    )

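# Both check helpers above reference `self` and call self.assertEqual without taking `self`
# as a parameter, which suggests they are nested inside a test method and capture `self`
# through the closure. A minimal sketch of that arrangement; the test class name and the
# sample Series are assumptions for illustration, not taken from the snippets.
import unittest

import pyspark.pandas as ps


class SeriesPlotPlotlyTest(unittest.TestCase):  # hypothetical test class
    def test_plotly_plots(self):
        # 11 values chosen so that a 10-bin histogram over [1, 50] would yield the counts
        # [5, 4, 1, 0, ..., 0, 1] and the box stats (median 6, mean 10.0, outlier 50)
        # hard-coded in the expected figures above.
        kser = ps.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50], name="a")

        def check_hist_plot(kser):
            ...  # body as in the first snippet; `self` comes from the enclosing method

        def check_pox_plot(kser):
            ...  # body as in the snippet above

        check_hist_plot(kser)
        check_pox_plot(kser)
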
def __init__(
    self, dtypes: List[Dtype], spark_types: List[types.DataType], names: List[Optional[str]]
):
    from pyspark.pandas.internal import InternalField
    from pyspark.pandas.utils import name_like_string

    self.fields = [
        InternalField(
            dtype=dtype,
            struct_field=types.StructField(
                name=(name_like_string(name) if name is not None else ("c%s" % i)),
                dataType=spark_type,
            ),
        )
        for i, (name, dtype, spark_type) in enumerate(zip(names, dtypes, spark_types))
    ]

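# Both __init__ variants above share the same naming rule: unnamed columns fall back to
# positional names "c0", "c1", ..., while everything else goes through name_like_string,
# which also flattens tuple labels from multi-index columns. A minimal sketch of that rule
# in isolation; the helper name `_field_name` is illustrative and not part of the API.
from typing import Optional, Tuple, Union

from pyspark.pandas.utils import name_like_string


def _field_name(name: Optional[Union[str, Tuple[str, ...]]], position: int) -> str:
    # Hypothetical helper mirroring the fallback used when building StructFields.
    return name_like_string(name) if name is not None else ("c%s" % position)


print(_field_name(None, 0))        # 'c0'
print(_field_name("a", 1))         # 'a'
print(_field_name(("x", "a"), 2))  # '(x, a)'
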
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are strings to avoid forward references.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (
        tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series
    ):
        # When the Python version is lower than 3.7, unwrap it to Tuple/SeriesType type hints.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name
                            if index_name is not None
                            else SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    )
                )
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None
                        else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )

        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)

def compute_hist(psdf, bins):
    # 'data' is a Spark DataFrame that selects one column.
    assert isinstance(bins, (np.ndarray, np.generic))

    sdf = psdf._internal.spark_frame
    scols = []
    input_column_names = []
    for label in psdf._internal.column_labels:
        input_column_name = name_like_string(label)
        input_column_names.append(input_column_name)
        scols.append(psdf._internal.spark_column_for(label).alias(input_column_name))
    sdf = sdf.select(*scols)

    # 1. Make the bucket output flat to:
    #     +----------+-------+
    #     |__group_id|buckets|
    #     +----------+-------+
    #     |0         |0.0    |
    #     |0         |0.0    |
    #     |0         |1.0    |
    #     |0         |2.0    |
    #     |0         |3.0    |
    #     |0         |3.0    |
    #     |1         |0.0    |
    #     |1         |1.0    |
    #     |1         |1.0    |
    #     |1         |2.0    |
    #     |1         |1.0    |
    #     |1         |0.0    |
    #     +----------+-------+
    colnames = sdf.columns
    bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

    output_df = None
    for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(
            splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
        )

        bucket_df = bucketizer.transform(sdf)

        if output_df is None:
            output_df = bucket_df.select(
                F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
            )
        else:
            output_df = output_df.union(
                bucket_df.select(
                    F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                )
            )

    # 2. Calculate the count based on each group and bucket.
    #     +----------+-------+------+
    #     |__group_id|buckets| count|
    #     +----------+-------+------+
    #     |0         |0.0    |2     |
    #     |0         |1.0    |1     |
    #     |0         |2.0    |1     |
    #     |0         |3.0    |2     |
    #     |1         |0.0    |2     |
    #     |1         |1.0    |3     |
    #     |1         |2.0    |1     |
    #     +----------+-------+------+
    result = (
        output_df.groupby("__group_id", "__bucket")
        .agg(F.count("*").alias("count"))
        .toPandas()
        .sort_values(by=["__group_id", "__bucket"])
    )

    # 3. Fill empty bins and calculate based on each group id. From:
    #     +----------+--------+------+
    #     |__group_id|__bucket| count|
    #     +----------+--------+------+
    #     |0         |0.0     |2     |
    #     |0         |1.0     |1     |
    #     |0         |2.0     |1     |
    #     |0         |3.0     |2     |
    #     +----------+--------+------+
    #     +----------+--------+------+
    #     |__group_id|__bucket| count|
    #     +----------+--------+------+
    #     |1         |0.0     |2     |
    #     |1         |1.0     |3     |
    #     |1         |2.0     |1     |
    #     +----------+--------+------+
    #
    # to:
    #     +-----------------+
    #     |__values1__bucket|
    #     +-----------------+
    #     |2                |
    #     |1                |
    #     |1                |
    #     |2                |
    #     |0                |
    #     +-----------------+
    #     +-----------------+
    #     |__values2__bucket|
    #     +-----------------+
    #     |2                |
    #     |3                |
    #     |1                |
    #     |0                |
    #     |0                |
    #     +-----------------+
    output_series = []
    for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
        current_bucket_result = result[result["__group_id"] == i]
        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
        # merges the bins with counts on it and fills remaining ones with zeros
        pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
            ["count"]
        ]
        pdf.columns = [input_column_name]
        output_series.append(pdf[input_column_name])

    return output_series

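# compute_hist buckets every selected column on the Spark side (via Bucketizer) and returns
# one pandas Series of per-bin counts per column. A hedged usage sketch, assuming compute_hist
# above is in scope and a SparkSession is active; the sample data and bin edges below are
# illustrative, not taken from the snippet.
import numpy as np

import pyspark.pandas as ps

psdf = ps.DataFrame(
    {
        "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
        "b": [2, 2, 2, 3, 3, 5, 5, 5, 8, 8, 8],
    }
)

# Equally spaced bin edges over [1, 50], the way a histogram plot would build them.
bins = np.linspace(1, 50, 11)

# One pandas Series per column, each with len(bins) - 1 rows; empty bins are filled with 0.
counts_a, counts_b = compute_hist(psdf, bins)
print(counts_a.tolist())  # expected to be something like [5.0, 4.0, 1.0, 0.0, ..., 0.0, 1.0]
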
def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
    """
    Attach a column to be used as an identifier of rows, similar to the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

            .. note:: this uses Spark's Window without specifying a partition specification.
                This moves all data into a single partition on a single machine and could
                cause serious performance degradation.
                Avoid this method with very large datasets.

        - 'distributed-sequence' : a sequence that increases one by one,
            computed by a group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
            monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The column name.

    Returns
    -------
    DataFrame
        The DataFrame with the attached column.

    Examples
    --------
    >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
       x  0
    0  a  0
    1  b  1
    2  c  2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x  0.0
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
       x   0
       y 1.0
    0  a   0
    1  b   1
    2  c   2
    """
    from pyspark.pandas.frame import DataFrame

    if id_type == "sequence":
        attach_func = InternalFrame.attach_sequence_column
    elif id_type == "distributed-sequence":
        attach_func = InternalFrame.attach_distributed_sequence_column
    elif id_type == "distributed":
        attach_func = InternalFrame.attach_distributed_column
    else:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )

    assert is_name_like_value(column, allow_none=False), column
    if not is_name_like_tuple(column):
        column = (column,)

    internal = self._psdf._internal

    if len(column) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns.".format(
                column
            )
        )
    elif column in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(column))
        )

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    sdf = internal.spark_frame.select(
        [
            scol.alias(SPARK_INDEX_NAME_FORMAT(i))
            for i, scol in enumerate(internal.index_spark_columns)
        ]
        + [
            scol.alias(name_like_string(label))
            for scol, label in zip(internal.data_spark_columns, internal.column_labels)
        ]
    )
    sdf = attach_func(sdf, name_like_string(column))

    return DataFrame(
        InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
            ],
            index_names=internal.index_names,
            index_fields=internal.index_fields,
            column_labels=internal.column_labels + [column],
            data_spark_columns=(
                [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                + [scol_for(sdf, name_like_string(column))]
            ),
            data_fields=internal.data_fields
            + [
                InternalField.from_struct_field(
                    StructField(name_like_string(column), LongType(), nullable=False)
                )
            ],
            column_label_names=internal.column_label_names,
        ).resolved_copy
    )

def drop(self, codes: List[Any], level: Optional[Union[int, Name]] = None) -> "MultiIndex":
    """
    Make new MultiIndex with passed list of labels deleted

    Parameters
    ----------
    codes : array-like
        Must be a list of tuples
    level : int or level name, default None

    Returns
    -------
    dropped : MultiIndex

    Examples
    --------
    >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    >>> index  # doctest: +SKIP
    MultiIndex([('a', 'x'),
                ('b', 'y'),
                ('c', 'z')],
               )

    >>> index.drop(['a'])  # doctest: +SKIP
    MultiIndex([('b', 'y'),
                ('c', 'z')],
               )

    >>> index.drop(['x', 'y'], level=1)  # doctest: +SKIP
    MultiIndex([('c', 'z')],
               )
    """
    internal = self._internal.resolved_copy
    sdf = internal.spark_frame
    index_scols = internal.index_spark_columns
    if level is None:
        scol = index_scols[0]
    elif isinstance(level, int):
        scol = index_scols[level]
    else:
        scol = None
        for index_spark_column, index_name in zip(
            internal.index_spark_columns, internal.index_names
        ):
            if not isinstance(level, tuple):
                level = (level,)
            if level == index_name:
                if scol is not None:
                    raise ValueError(
                        "The name {} occurs multiple times, use a level number".format(
                            name_like_string(level)
                        )
                    )
                scol = index_spark_column
        if scol is None:
            raise KeyError("Level {} not found".format(name_like_string(level)))
    sdf = sdf[~scol.isin(codes)]

    internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=[
            scol_for(sdf, col) for col in internal.index_spark_column_names
        ],
        index_names=internal.index_names,
        index_fields=internal.index_fields,
        column_labels=[],
        data_spark_columns=[],
        data_fields=[],
    )
    return cast(MultiIndex, DataFrame(internal).index)

def frame(self, index_col: Optional[Union[str, List[str]]] = None) -> SparkDataFrame:
    """
    Return the current DataFrame as a Spark DataFrame.  :meth:`DataFrame.spark.frame` is an
    alias of :meth:`DataFrame.to_spark`.

    Parameters
    ----------
    index_col: str or list of str, optional, default: None
        Column names to be used in Spark to represent pandas-on-Spark's index. The index name
        in pandas-on-Spark is ignored. By default, the index is always lost.

    See Also
    --------
    DataFrame.to_spark
    DataFrame.to_koalas
    DataFrame.spark.frame

    Examples
    --------
    By default, this method loses the index as below.

    >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    >>> df.to_spark().show()  # doctest: +NORMALIZE_WHITESPACE
    +---+---+---+
    |  a|  b|  c|
    +---+---+---+
    |  1|  4|  7|
    |  2|  5|  8|
    |  3|  6|  9|
    +---+---+---+

    >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    >>> df.spark.frame().show()  # doctest: +NORMALIZE_WHITESPACE
    +---+---+---+
    |  a|  b|  c|
    +---+---+---+
    |  1|  4|  7|
    |  2|  5|  8|
    |  3|  6|  9|
    +---+---+---+

    If `index_col` is set, it keeps the index column as specified.

    >>> df.to_spark(index_col="index").show()  # doctest: +NORMALIZE_WHITESPACE
    +-----+---+---+---+
    |index|  a|  b|  c|
    +-----+---+---+---+
    |    0|  1|  4|  7|
    |    1|  2|  5|  8|
    |    2|  3|  6|  9|
    +-----+---+---+---+

    Keeping the index column is useful when you want to call some Spark APIs and
    convert the result back to a pandas-on-Spark DataFrame without creating a default index,
    which can affect performance.

    >>> spark_df = df.to_spark(index_col="index")
    >>> spark_df = spark_df.filter("a == 2")
    >>> spark_df.to_koalas(index_col="index")  # doctest: +NORMALIZE_WHITESPACE
           a  b  c
    index
    1      2  5  8

    In case of multi-index, specify a list to `index_col`.

    >>> new_df = df.set_index("a", append=True)
    >>> new_spark_df = new_df.to_spark(index_col=["index_1", "index_2"])
    >>> new_spark_df.show()  # doctest: +NORMALIZE_WHITESPACE
    +-------+-------+---+---+
    |index_1|index_2|  b|  c|
    +-------+-------+---+---+
    |      0|      1|  4|  7|
    |      1|      2|  5|  8|
    |      2|      3|  6|  9|
    +-------+-------+---+---+

    Likewise, it can be converted back to a pandas-on-Spark DataFrame.

    >>> new_spark_df.to_koalas(
    ...     index_col=["index_1", "index_2"])  # doctest: +NORMALIZE_WHITESPACE
                     b  c
    index_1 index_2
    0       1        4  7
    1       2        5  8
    2       3        6  9
    """
    from pyspark.pandas.utils import name_like_string

    kdf = self._kdf

    data_column_names = []
    data_columns = []
    for i, (label, spark_column, column_name) in enumerate(
        zip(
            kdf._internal.column_labels,
            kdf._internal.data_spark_columns,
            kdf._internal.data_spark_column_names,
        )
    ):
        name = str(i) if label is None else name_like_string(label)
        data_column_names.append(name)
        if column_name != name:
            spark_column = spark_column.alias(name)
        data_columns.append(spark_column)

    if index_col is None:
        return kdf._internal.spark_frame.select(data_columns)
    else:
        if isinstance(index_col, str):
            index_col = [index_col]

        old_index_scols = kdf._internal.index_spark_columns

        if len(index_col) != len(old_index_scols):
            raise ValueError(
                "length of index columns is %s; however, the length of the given "
                "'index_col' is %s." % (len(old_index_scols), len(index_col))
            )

        if any(col in data_column_names for col in index_col):
            raise ValueError("'index_col' cannot be overlapped with other columns.")

        new_index_scols = [
            index_scol.alias(col) for index_scol, col in zip(old_index_scols, index_col)
        ]
        return kdf._internal.spark_frame.select(new_index_scols + data_columns)

def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Tuple has _name but other types have __name__.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    # Check if the name is Tuple.
    if name == "Tuple":
        tuple_type = tpe
        parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name
                            if index_name is not None
                            else SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    )
                )
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None
                        else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )

        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
