Example No. 1
def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    return index_ops._with_new_scol(
        index_ops.spark.column.isNull(),
        field=index_ops._internal.data_fields[0].copy(
            dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
        ),
    )
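
For orientation, a minimal usage sketch of the null check above through the public pandas-on-Spark API (assuming `pyspark.pandas` is available and a Spark session can be started). Note the result field is marked `nullable=False`, since isnull itself never returns null:

    import pyspark.pandas as ps

    psser = ps.Series([1.0, None, 3.0])
    psser.isnull()
    # 0    False
    # 1     True
    # 2    False
    # dtype: bool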
Example No. 2
def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    # Note that timestamp subtraction casts arguments to integer. This is to mimic
    # pandas' behavior: pandas returns 'timedelta64[ns]' from 'datetime64[ns]' subtraction.
    msg = (
        "Note that there is a behavior difference in timestamp subtraction. "
        "The timestamp subtraction returns an integer in seconds, "
        "whereas pandas returns 'timedelta64[ns]'.")
    if isinstance(right, IndexOpsMixin) and isinstance(
            right.spark.data_type, TimestampType):
        warnings.warn(msg, UserWarning)
        return left.astype("long") - right.astype("long")
    elif isinstance(right, datetime.datetime):
        warnings.warn(msg, UserWarning)
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                left.spark.column.cast(LongType()) -
                SF.lit(right).cast(LongType()),
                field=left._internal.data_fields[0].copy(
                    dtype=np.dtype("int64"), spark_type=LongType()),
            ),
        )
    else:
        raise TypeError(
            "Datetime subtraction can only be applied to datetime series.")
Example No. 3
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                scol = F.when(
                    index_ops.spark.column.isNotNull(),
                    F.when(index_ops.spark.column, "True").otherwise("False"),
                )
                nullable = index_ops.spark.nullable
            else:
                null_str = str(pd.NA) if isinstance(
                    self, BooleanExtensionOps) else str(None)
                casted = F.when(index_ops.spark.column,
                                "True").otherwise("False")
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
                nullable = False
            return index_ops._with_new_scol(
                scol,
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type, nullable=nullable),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 4
def _as_categorical_type(index_ops: IndexOpsLike, dtype: CategoricalDtype,
                         spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to categorical dtype, given `dtype` and `spark_type`."""
    assert isinstance(dtype, CategoricalDtype)
    if dtype.categories is None:
        codes, uniques = index_ops.factorize()
        return codes._with_new_scol(
            codes.spark.column,
            field=codes._internal.data_fields[0].copy(dtype=CategoricalDtype(
                categories=uniques)),
        )
    else:
        categories = dtype.categories
        if len(categories) == 0:
            scol = SF.lit(-1)
        else:
            kvs = chain(*[(SF.lit(category), SF.lit(code))
                          for code, category in enumerate(categories)])
            map_scol = F.create_map(*kvs)

            scol = F.coalesce(map_scol[index_ops.spark.column], SF.lit(-1))
        return index_ops._with_new_scol(
            scol.cast(spark_type),
            field=index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type, nullable=False),
        )
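
The category-to-code mapping above can be observed in isolation with plain PySpark. A minimal sketch (data and column names are illustrative):

    from itertools import chain

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([("a",), ("c",), ("x",)], ["value"])

    categories = ["a", "b", "c"]
    kvs = chain(*[(F.lit(category), F.lit(code))
                  for code, category in enumerate(categories)])
    map_scol = F.create_map(*kvs)

    # A value absent from `categories` maps to NULL, and coalesce turns
    # that into code -1, matching pandas' code for an unknown category.
    sdf.select(F.coalesce(map_scol[F.col("value")], F.lit(-1)).alias("code")).show()
    # codes: 0, 2, -1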
Example No. 5
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" % self.pretty_name
                )
        elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
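
From the user's side, the `hasnans` guard above surfaces as a ValueError; a quick sketch (the exact wording depends on the operator's `pretty_name`):

    import pyspark.pandas as ps

    psser = ps.Series([1.0, None])
    psser.astype(int)
    # ValueError: Cannot convert <pretty_name> with missing values to integer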
Example No. 6
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                scol = F.when(
                    index_ops.spark.column.isNotNull(),
                    F.when(index_ops.spark.column, "True").otherwise("False"),
                )
            else:
                null_str = str(None)
                casted = F.when(index_ops.spark.column,
                                "True").otherwise("False")
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 7
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                # This seems like a pandas bug?
                scol = F.when(index_ops.spark.column.isNull(),
                              str(pd.NaT)).otherwise(
                                  index_ops.spark.column.cast(spark_type))
            else:
                null_str = str(pd.NaT)
                casted = index_ops.spark.column.cast(spark_type)
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 8
def nan_to_null(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    # Specially handle floating-point types, because Spark's count treats NaN as a
    # valid value, whereas pandas' count doesn't include NaN.
    return index_ops._with_new_scol(
        F.nanvl(index_ops.spark.column, SF.lit(None)),
        field=index_ops._internal.data_fields[0].copy(nullable=True),
    )
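
The NaN-to-null trick can be verified with plain PySpark; a minimal sketch:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], ["x"])

    sdf.select(F.count("x")).show()                        # 2: Spark counts NaN
    sdf.select(F.count(F.nanvl("x", F.lit(None)))).show()  # 1: NaN nulled out, like pandas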
Example No. 9
def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike:
    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(*[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)])
        map_scol = F.create_map(*kvs)
        scol = map_scol[index_ops.spark.column]
    return index_ops._with_new_scol(scol)
Example No. 10
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, str):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(F.concat(SF.lit(right), left.spark.column),
                                field=left._internal.data_fields[0]),
        )
    else:
        raise TypeError("Addition can not be applied to given types.")
Example No. 11
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, int):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
            ),
        )
    else:
        raise TypeError("Multiplication can not be applied to given types.")
Example No. 12
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, bytes):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(F.concat(SF.lit(right),
                                         left.spark.column)))
    else:
        raise TypeError(
            "Concatenation can not be applied to %s and the given type." %
            self.pretty_name)
Example No. 13
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, str):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(
                F.concat(left.spark.column, SF.lit(right)), field=left._internal.data_fields[0]
            ),
        )
    elif isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
        return column_op(F.concat)(left, right)
    else:
        raise TypeError("Addition can not be applied to given types.")
Example No. 14
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    spark_type = BooleanType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
            index_ops.spark.column.cast(spark_type)
        )
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
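
The null-to-False rule above, sketched with plain PySpark:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(0,), (3,), (None,)], ["x"])

    scol = F.when(F.col("x").isNull(), F.lit(False)).otherwise(
        F.col("x").cast("boolean"))
    sdf.select(scol.alias("as_bool")).show()
    # false, true, false -- the NULL becomes False instead of staying NULL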
Example No. 15
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, NumericType):
            from pyspark.pandas.internal import InternalField

            scol = self._cast_spark_column_timestamp_to_long(index_ops.spark.column).cast(
                spark_type
            )
            return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
        else:
            return super(DatetimeNTZOps, self).astype(index_ops, dtype)
Example No. 16
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, int):
        return cast(
            SeriesOrIndex,
            left._with_new_scol(SF.repeat(left.spark.column, right),
                                field=left._internal.data_fields[0]),
        )
    elif (isinstance(right, IndexOpsMixin)
          and isinstance(right.spark.data_type, IntegralType)
          and not isinstance(right.dtype, CategoricalDtype)):
        return column_op(SF.repeat)(left, right)
    else:
        raise TypeError(
            "Multiplication can not be applied to given types.")
Example No. 17
def _as_other_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype],
                   spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to a `dtype` (`spark_type`) that needs no pre-processing.

    Destination types that need pre-processing: CategoricalDtype, BooleanType, and StringType.
    """
    from pyspark.pandas.internal import InternalField

    need_pre_process = (isinstance(dtype, CategoricalDtype)
                        or isinstance(spark_type, BooleanType)
                        or isinstance(spark_type, StringType))
    assert not need_pre_process, "Pre-processing is needed before the type casting."

    scol = index_ops.spark.column.cast(spark_type)
    return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
Example No. 18
def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(BooleanType())
    else:
        scol = F.when(index_ops.spark.column.isNull(),
                      SF.lit(False)).otherwise(
                          index_ops.spark.column.cast(BooleanType()))
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
Example No. 19
def _as_string_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
    """
    spark_type = StringType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        casted = index_ops.spark.column.cast(spark_type)
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
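
The `null_str` substitution, sketched with plain PySpark using the default `str(None)`:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.5,), (None,)], ["x"])

    casted = F.col("x").cast("string")
    scol = F.when(F.col("x").isNull(), "None").otherwise(casted)
    sdf.select(scol.alias("as_str")).show()
    # "1.5", "None" -- nulls render as the literal string "None"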
Example No. 20
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, _ = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
            return index_ops.copy()

        categories = cast(CategoricalDtype, index_ops.dtype).categories
        if len(categories) == 0:
            scol = SF.lit(None)
        else:
            kvs = chain(
                *[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)]
            )
            map_scol = F.create_map(*kvs)
            scol = map_scol.getItem(index_ops.spark.column)
        return index_ops._with_new_scol(scol).astype(dtype)
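
Decoding in the opposite direction (code back to category) uses the same map trick; a minimal standalone sketch:

    from itertools import chain

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(0,), (2,), (-1,)], ["code"])

    categories = ["low", "mid", "high"]
    kvs = chain(*[(F.lit(code), F.lit(category))
                  for code, category in enumerate(categories)])
    map_scol = F.create_map(*kvs)

    # Code -1 (a missing value) has no key in the map and decodes to NULL.
    sdf.select(map_scol.getItem(F.col("code")).alias("category")).show()
    # low, high, null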
Example No. 21
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return index_ops._with_new_scol(
                index_ops.spark.column.isNotNull(),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=np.dtype(bool),
                    spark_type=spark_type,
                    nullable=False),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 22
def _as_string_type(index_ops: IndexOpsLike,
                    dtype: Union[str, type, Dtype],
                    *,
                    null_str: str = str(None)) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column.
    """
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(StringType())
    else:
        casted = index_ops.spark.column.cast(StringType())
        scol = F.when(index_ops.spark.column.isNull(),
                      null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
Example No. 23
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(),
                              SF.lit(False)).otherwise(
                                  F.length(index_ops.spark.column) > 0)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 24
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                    F.length(index_ops.spark.column) > 0
                )
            return index_ops._with_new_scol(
                scol,
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
            return _as_string_type(index_ops, dtype, null_str=null_str)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
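
The string-to-bool rule (empty string is False, any other string is True, NULL becomes False), sketched with plain PySpark:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([("",), ("a",), (None,)], ["s"])

    scol = F.when(F.col("s").isNull(), F.lit(False)).otherwise(F.length("s") > 0)
    sdf.select(scol.alias("as_bool")).show()
    # false, true, false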
Example No. 25
def invert(self, operand: IndexOpsLike) -> IndexOpsLike:
    return operand._with_new_scol(
        F.bitwise_not(operand.spark.column), field=operand._internal.data_fields[0]
    )
Example No. 26
def abs(self, operand: IndexOpsLike) -> IndexOpsLike:
    return operand._with_new_scol(
        F.abs(operand.spark.column), field=operand._internal.data_fields[0]
    )
Example No. 27
def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
    return operand._with_new_scol(-operand.spark.column, field=operand._internal.data_fields[0])
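
Usage sketch for the three unary operators above through the public API; they follow pandas semantics on integers (assuming `pyspark.pandas` is available):

    import pyspark.pandas as ps

    psser = ps.Series([1, -2, 3])
    ~psser        # -2,  1, -4  (bitwise NOT)
    psser.abs()   #  1,  2,  3
    -psser        # -1,  2, -3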