Пример #1
0
    def _get_plot_backend(backend=None):
        backend = backend or get_option("plotting.backend")
        # Shortcut
        if backend in PandasOnSparkPlotAccessor._backends:
            return PandasOnSparkPlotAccessor._backends[backend]

        if backend == "matplotlib":
            # Because matplotlib is an optional dependency,
            # we need to attempt an import here to raise an ImportError if needed.
            try:
                # test if matplotlib can be imported
                import matplotlib  # noqa: F401
                from pyspark.pandas.plot import matplotlib as module
            except ImportError:
                raise ImportError(
                    "matplotlib is required for plotting when the "
                    "default backend 'matplotlib' is selected.") from None

            PandasOnSparkPlotAccessor._backends["matplotlib"] = module
        elif backend == "plotly":
            try:
                # test if plotly can be imported
                import plotly  # noqa: F401
                from pyspark.pandas.plot import plotly as module
            except ImportError:
                raise ImportError(
                    "plotly is required for plotting when the "
                    "default backend 'plotly' is selected.") from None

            PandasOnSparkPlotAccessor._backends["plotly"] = module
        else:
            module = PandasOnSparkPlotAccessor._find_backend(backend)
            PandasOnSparkPlotAccessor._backends[backend] = module
        return module
Пример #2
0
    def get_sampled(self, data):
        from pyspark.pandas import DataFrame, Series

        fraction = get_option("plotting.sample_ratio")
        if fraction is None:
            fraction = 1 / (len(data) / get_option("plotting.max_rows"))
            fraction = min(1.0, fraction)
        self.fraction = fraction

        if isinstance(data, (DataFrame, Series)):
            if isinstance(data, Series):
                data = data.to_frame()
            sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
            return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
        else:
            raise ValueError("Only DataFrame and Series are supported for plotting.")
Пример #3
0
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if get_option("compute.eager_check") and index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" %
                    self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull()
                    | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Пример #4
0
 def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
     dtype, spark_type = pandas_on_spark_type(dtype)
     if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
         if get_option("compute.eager_check") and index_ops.hasnans:
             raise ValueError(
                 "Cannot convert %s with missing values to integer" % self.pretty_name
             )
     return _non_fractional_astype(index_ops, dtype, spark_type)
Пример #5
0
    def set_result_text(self, ax):
        max_rows = get_option("plotting.max_rows")
        assert hasattr(self, "partial")

        if self.partial:
            ax.text(
                1,
                1,
                "showing top {} elements only".format(max_rows),
                size=6,
                ha="right",
                va="bottom",
                transform=ax.transAxes,
            )
Пример #6
0
    def get_top_n(self, data):
        from pyspark.pandas import DataFrame, Series

        max_rows = get_option("plotting.max_rows")
        # Simply use the first 1k elements and make it into a pandas dataframe
        # For categorical variables, it is likely called from df.x.value_counts().plot.xxx().
        if isinstance(data, (Series, DataFrame)):
            data = data.head(max_rows + 1).to_pandas()
        else:
            raise ValueError("Only DataFrame and Series are supported for plotting.")

        self.partial = False
        if len(data) > max_rows:
            self.partial = True
            data = data.iloc[:max_rows]
        return data
Пример #7
0
def combine_frames(
    this: "DataFrame",
    *args: DataFrameOrSeries,
    how: str = "full",
    preserve_order_column: bool = False
) -> "DataFrame":
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has prefix `this_` and `that_` to distinct
    the columns names from both DataFrames

    It internally performs a join operation which can be expensive in general.
    So, if `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from pyspark.pandas.config import get_option
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.internal import (
        InternalField,
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from pyspark.pandas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(this, args[0]), "We don't need to combine. All series is in this."
        that = args[0]._psdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]
        ), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or " "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal: InternalFrame, side: str) -> InternalFrame:
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                *[
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ],
                *HIDDEN_COLUMNS
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                index_fields=[
                    field.copy(name=rename(field.name)) for field in internal.index_fields
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
                data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(
                this_internal.index_spark_column_names,
                this_internal.index_names,
                this_internal.index_fields,
            )
        )
        that_index_map = list(
            zip(
                that_internal.index_spark_column_names,
                that_internal.index_names,
                that_internal.index_fields,
            )
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        index_use_extension_dtypes = []
        for (
            i,
            ((this_column, this_name, this_field), (that_column, that_name, that_field)),
        ) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                index_use_extension_dtypes.append(
                    any(field.is_extension_dtype for field in [this_field, that_field])
                )
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            *merged_index_scols,
            *(
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ),
            *(
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ),
            *order_column
        )

        index_spark_columns = [scol_for(joined_df, col) for col in index_column_names]

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]

        schema = joined_df.select(*index_spark_columns, *new_data_columns).schema

        index_fields = [
            InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes)
            for struct_field, use_extension_dtypes in zip(
                schema.fields[: len(index_spark_columns)], index_use_extension_dtypes
            )
        ]
        data_fields = [
            InternalField.from_struct_field(
                struct_field, use_extension_dtypes=field.is_extension_dtype
            )
            for struct_field, field in zip(
                schema.fields[len(index_spark_columns) :],
                this_internal.data_fields + that_internal.data_fields,
            )
        ]

        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label: Optional[Tuple]) -> List:
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels]
        column_label_names = (
            cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=index_spark_columns,
                index_names=this_internal.index_names,
                index_fields=index_fields,
                column_labels=column_labels,
                data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns],
                data_fields=data_fields,
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)