def nan_to_null(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    # Specially handle floating point types because Spark's count treats NaN as a valid value,
    # whereas pandas' count doesn't include NaN.
    return index_ops._with_new_scol(
        F.nanvl(index_ops.spark.column, SF.lit(None)),
        field=index_ops._internal.data_fields[0].copy(nullable=True),
    )
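# A minimal usage sketch, not from the source (it assumes a working local
# Spark session started by pyspark.pandas): because nan_to_null() converts
# NaN to null before counting, count() skips NaN just as pandas does.
#
#     >>> import pyspark.pandas as ps
#     >>> ps.Series([1.0, float("nan"), 3.0]).count()
#     2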
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, _ = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
        return index_ops.copy()

    return _to_cat(index_ops).astype(dtype)
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, _ = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
        return index_ops.copy()

    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(
            *[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)]
        )
        map_scol = F.create_map(*kvs)
        scol = map_scol.getItem(index_ops.spark.column)
    return index_ops._with_new_scol(scol).astype(dtype)
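# A hedged sketch of what this cast does in practice (assuming a local
# pyspark.pandas session): the stored integer codes are mapped back to their
# category labels via the MapType column before the final cast.
#
#     >>> import pyspark.pandas as ps
#     >>> psser = ps.Series(["b", "a", "b"], dtype="category")
#     >>> psser.astype(str).to_pandas().tolist()
#     ['b', 'a', 'b']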
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            # seems like a pandas bug?
            scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
                index_ops.spark.column.cast(spark_type)
            )
        else:
            null_str = str(pd.NaT)
            casted = index_ops.spark.column.cast(spark_type)
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    return index_ops._with_new_scol(
        index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
        field=index_ops._internal.data_fields[0].copy(
            dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
        ),
    )
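# A small illustration, assuming a pyspark.pandas session: both null and NaN
# count as missing, matching pandas.
#
#     >>> import pyspark.pandas as ps
#     >>> ps.Series([1.0, float("nan"), None]).isnull().to_pandas().tolist()
#     [False, True, True]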
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(
                index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                SF.lit(True),
            ).otherwise(index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
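# Sketch of the guard above (assumption: a local pyspark.pandas session):
# casting a float Series that contains missing values to a plain integer
# dtype raises, mirroring pandas; `...` stands in for this op's pretty_name.
#
#     >>> import pyspark.pandas as ps
#     >>> ps.Series([1.0, None]).astype(int)
#     Traceback (most recent call last):
#     ...
#     ValueError: Cannot convert ... with missing values to integer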
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            scol = F.when(
                index_ops.spark.column.isNotNull(),
                F.when(index_ops.spark.column, "True").otherwise("False"),
            )
            nullable = index_ops.spark.nullable
        else:
            null_str = str(pd.NA) if isinstance(self, BooleanExtensionOps) else str(None)
            casted = F.when(index_ops.spark.column, "True").otherwise("False")
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
            nullable = False
        return index_ops._with_new_scol(
            scol,
            field=index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type, nullable=nullable
            ),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            scol = F.when(
                index_ops.spark.column.isNotNull(),
                F.when(index_ops.spark.column, "True").otherwise("False"),
            )
        else:
            null_str = str(None)
            casted = F.when(index_ops.spark.column, "True").otherwise("False")
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike:
    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(
            *[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)]
        )
        map_scol = F.create_map(*kvs)
        scol = map_scol[index_ops.spark.column]
    return index_ops._with_new_scol(scol)
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, str):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                F.concat(SF.lit(right), left.spark.column),
                field=left._internal.data_fields[0],
            ),
        )
    else:
        raise TypeError("Addition can not be applied to given types.")
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    _sanitize_list_like(right)
    if not is_valid_operand_for_numeric_arithmetic(right):
        raise TypeError(
            "Multiplication can not be applied to %s and the given type." % self.pretty_name
        )

    if isinstance(right, bool):
        return left.__and__(right)
    elif isinstance(right, numbers.Number):
        left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
        return left * right
    else:
        assert isinstance(right, IndexOpsMixin)
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BooleanType):
            return left.__and__(right)
        else:
            left = transform_boolean_operand_to_numeric(left, spark_type=right.spark.data_type)
            return left * right
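# Hedged sketch (assumes a pyspark.pandas session): multiplying a boolean
# Series by a bool degrades to logical AND, while multiplying by a number
# first casts the booleans to that number's type.
#
#     >>> import pyspark.pandas as ps
#     >>> psser = ps.Series([True, False])
#     >>> (psser * True).to_pandas().tolist()
#     [True, False]
#     >>> (psser * 3).to_pandas().tolist()
#     [3, 0]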
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, int):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
            ),
        )
    else:
        raise TypeError("Multiplication can not be applied to given types.")
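# Example of the string rmul path (assumption: a pyspark.pandas session):
# `int * str-Series` repeats each string, as in pandas.
#
#     >>> import pyspark.pandas as ps
#     >>> (3 * ps.Series(["ab"])).to_pandas().tolist()
#     ['ababab']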
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, bytes):
        return cast(
            SeriesOrIndex, left._with_new_scol(F.concat(SF.lit(right), left.spark.column))
        )
    else:
        raise TypeError(
            "Concatenation can not be applied to %s and the given type." % self.pretty_name
        )
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    _sanitize_list_like(right)
    if isinstance(right, bool):
        return left.__or__(right)
    elif isinstance(right, numbers.Number):
        left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
        return right + left
    else:
        raise TypeError(
            "Addition can not be applied to %s and the given type." % self.pretty_name
        )
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, bool):
        return left.__and__(right)
    elif isinstance(right, numbers.Number):
        left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
        return right * left
    else:
        raise TypeError(
            "Multiplication can not be applied to %s and the given type." % self.pretty_name
        )
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, bool):
        return left.__and__(right)
    elif isinstance(right, numbers.Number):
        left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
        return right * left
    else:
        raise TypeError(
            "Multiplication can not be applied to %s and the given type." % self.pretty_name
        )
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if not is_valid_operand_for_numeric_arithmetic(right):
        raise TypeError(
            "Addition can not be applied to %s and the given type." % self.pretty_name
        )

    if isinstance(right, bool):
        return left.__or__(right)
    elif isinstance(right, numbers.Number):
        left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
        return left + right
    else:
        assert isinstance(right, IndexOpsMixin)
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BooleanType):
            return left.__or__(right)
        else:
            left = transform_boolean_operand_to_numeric(left, right.spark.data_type)
            return left + right
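# Sketch (assuming a pyspark.pandas session): bool + bool is logical OR,
# while bool + number casts the booleans to the number's type and adds.
#
#     >>> import pyspark.pandas as ps
#     >>> psser = ps.Series([True, False])
#     >>> (psser + True).to_pandas().tolist()
#     [True, True]
#     >>> (psser + 1).to_pandas().tolist()
#     [2, 1]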
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    spark_type = BooleanType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
            index_ops.spark.column.cast(spark_type)
        )
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, str):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                F.concat(left.spark.column, SF.lit(right)), field=left._internal.data_fields[0]
            ),
        )
    elif isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
        return column_op(F.concat)(left, right)
    else:
        raise TypeError("Addition can not be applied to given types.")
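# Illustration (assumes a pyspark.pandas session): string addition is
# concatenation, with scalars wrapped in a literal column.
#
#     >>> import pyspark.pandas as ps
#     >>> (ps.Series(["a", "b"]) + "!").to_pandas().tolist()
#     ['a!', 'b!']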
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    _sanitize_list_like(right)
    # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
    # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
    msg = (
        "Note that there is a behavior difference of timestamp subtraction. "
        "The timestamp subtraction returns an integer in seconds, "
        "whereas pandas returns 'timedelta64[ns]'."
    )
    if isinstance(right, datetime.datetime):
        warnings.warn(msg, UserWarning)
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                self._cast_spark_column_timestamp_to_long(SF.lit(right))
                - left.astype("long").spark.column,
                field=left._internal.data_fields[0].copy(
                    dtype=np.dtype("int64"), spark_type=LongType()
                ),
            ),
        )
    else:
        raise TypeError("Datetime subtraction can only be applied to datetime series.")
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        # Cannot cast binary to boolean in Spark.
        # We should cast binary to str first, and then cast it to boolean.
        return index_ops.astype(str).astype(bool)
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype)
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, NumericType):
        from pyspark.pandas.internal import InternalField

        scol = self._cast_spark_column_timestamp_to_long(index_ops.spark.column).cast(spark_type)
        return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
    else:
        return super(DatetimeNTZOps, self).astype(index_ops, dtype)
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, int):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
            ),
        )
    elif (
        isinstance(right, IndexOpsMixin)
        and isinstance(right.spark.data_type, IntegralType)
        and not isinstance(right.dtype, CategoricalDtype)
    ):
        return column_op(SF.repeat)(left, right)
    else:
        raise TypeError("Multiplication can not be applied to given types.")
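# Sketch of the string mul path (assumption: a pyspark.pandas session):
# `str-Series * int` repeats each string via Spark's repeat().
#
#     >>> import pyspark.pandas as ps
#     >>> (ps.Series(["ab"]) * 2).to_pandas().tolist()
#     ['abab']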
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(BooleanType())
    else:
        scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
            index_ops.spark.column.cast(BooleanType())
        )
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
def _as_string_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
    """
    spark_type = StringType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        casted = index_ops.spark.column.cast(spark_type)
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
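# Hedged illustration of `null_str` (assuming a pyspark.pandas session):
# the fractional astype above passes null_str=str(np.nan), so missing values
# in a plain float Series render as "nan" after casting to string.
#
#     >>> import pyspark.pandas as ps
#     >>> ps.Series([1.0, None]).astype(str).to_pandas().tolist()
#     ['1.0', 'nan']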
def _as_other_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], spark_type: DataType
) -> IndexOpsLike:
    """Cast `index_ops` to a `dtype` (`spark_type`) that needs no pre-processing.

    Destination types that need pre-processing: CategoricalDtype, BooleanType, and StringType.
    """
    from pyspark.pandas.internal import InternalField

    need_pre_process = (
        isinstance(dtype, CategoricalDtype)
        or isinstance(spark_type, BooleanType)
        or isinstance(spark_type, StringType)
    )
    assert not need_pre_process, "Pre-processing is needed before the type casting."

    scol = index_ops.spark.column.cast(spark_type)
    return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return index_ops._with_new_scol(
            index_ops.spark.column.isNotNull(),
            field=index_ops._internal.data_fields[0].copy(
                dtype=np.dtype(bool), spark_type=spark_type, nullable=False
            ),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
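# Sketch (assumes a pyspark.pandas session): casting datetimes to bool maps
# non-null to True and null (NaT) to False via isNotNull().
#
#     >>> import datetime
#     >>> import pyspark.pandas as ps
#     >>> psser = ps.Series([datetime.datetime(2022, 1, 1), None])
#     >>> psser.astype(bool).to_pandas().tolist()
#     [True, False]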
def _as_string_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column.
    """
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(StringType())
    else:
        casted = index_ops.spark.column.cast(StringType())
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
    # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
    msg = (
        "Note that there is a behavior difference of timestamp subtraction. "
        "The timestamp subtraction returns an integer in seconds, "
        "whereas pandas returns 'timedelta64[ns]'."
    )
    if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
        warnings.warn(msg, UserWarning)
        return left.astype("long") - right.astype("long")
    elif isinstance(right, datetime.datetime):
        warnings.warn(msg, UserWarning)
        return cast(
            SeriesOrIndex,
            left.spark.transform(
                lambda scol: scol.astype("long") - SF.lit(right).cast(as_spark_type("long"))
            ),
        )
    else:
        raise TypeError("datetime subtraction can only be applied to datetime series.")
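# Behavior sketch for the warning above (assumption: a pyspark.pandas
# session): subtraction yields whole seconds as an integer, not the
# 'timedelta64[ns]' pandas would return, and emits a UserWarning.
#
#     >>> import datetime
#     >>> import pyspark.pandas as ps
#     >>> psser = ps.Series([datetime.datetime(1994, 1, 31)])
#     >>> (psser - datetime.datetime(1994, 1, 30)).to_pandas().tolist()  # warns
#     [86400]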
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    if isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                F.length(index_ops.spark.column) > 0
            )
        return index_ops._with_new_scol(
            scol,
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    elif isinstance(spark_type, StringType):
        null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
        return _as_string_type(index_ops, dtype, null_str=null_str)
    else:
        return _as_other_type(index_ops, dtype, spark_type)
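# Hedged sketch (assumes a pyspark.pandas session): string-to-bool follows
# the truthiness of the string's length, with nulls mapped to False.
#
#     >>> import pyspark.pandas as ps
#     >>> ps.Series(["x", "", None]).astype(bool).to_pandas().tolist()
#     [True, False, False]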