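# Background for the function below: subscripting ps.Series / ps.DataFrame
# (e.g. ps.DataFrame["a": int, "b": str]) produces a typing.Tuple whose
# parameters are either plain types or small holder classes carrying a name
# and a type. A minimal sketch of the contract this function assumes, based
# only on the attribute accesses it performs (`p.name`, `p.tpe`); the
# canonical definitions live in pyspark.pandas.typedef, not here:
#
#     class NameTypeHolder:       # one data column: label + type
#         name = None
#         tpe = None
#
#     class IndexNameTypeHolder:  # one index level: label + type
#         name = None
#         tpe = None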
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas-only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    # get_type_hints resolves string annotations (forward references) for us.
    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that DataFrame type hints will create a Tuple.
    # Tuple has `_name` but other types have `__name__`.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))

    # Check if the name is Tuple.
    if name == "Tuple":
        tuple_type = tpe
        parameters = getattr(tuple_type, "__args__")

        # Split the Tuple parameters into index holders (optional) and data holders.
        index_parameters = [
            p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name
                            if index_name is not None
                            else SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    )
                )
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        # Resolve each data parameter to its (dtype, Spark type) pair; unnamed
        # columns fall back to the positional names c0, c1, ...
        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name) if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )

        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
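# A minimal usage sketch, assuming pandas and pyspark.pandas are installed.
# `demo_func` and the column labels "id"/"value" are illustrative only; the
# expected outputs mirror the doctests above.
if __name__ == "__main__":
    import pyspark.pandas as ps

    def demo_func() -> ps.DataFrame["id": int, "value": float]:
        pass

    inferred = infer_return_type(demo_func)
    print(inferred.dtypes)  # expected: [dtype('int64'), dtype('float64')]
    print(inferred.spark_type.simpleString())  # expected: struct<id:bigint,value:double>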