def test_toPandas_respect_session_timezone(self): df = self.spark.createDataFrame(self.data, schema=self.schema) timezone = "America/Los_Angeles" with self.sql_conf({"spark.sql.session.timeZone": timezone}): pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df) assert_frame_equal(pdf_arrow_la, pdf_la) timezone = "America/New_York" with self.sql_conf({"spark.sql.session.timeZone": timezone}): pdf_ny, pdf_arrow_ny = self._toPandas_arrow_toggle(df) assert_frame_equal(pdf_arrow_ny, pdf_ny) self.assertFalse(pdf_ny.equals(pdf_la)) from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz pdf_la_corrected = pdf_la.copy() for field in self.schema: if isinstance(field.dataType, TimestampType): pdf_la_corrected[ field. name] = _check_series_convert_timestamps_local_tz( pdf_la_corrected[field.name], timezone) assert_frame_equal(pdf_ny, pdf_la_corrected)
def toPandas(self): """ Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. This is only available if Pandas is installed and available. .. versionadded:: 1.3.0 Notes ----- This method should only be used if the resulting Pandas's :class:`DataFrame` is expected to be small, as all the data is loaded into the driver's memory. Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. Examples -------- >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice 1 5 Bob """ from pyspark.sql.dataframe import DataFrame assert isinstance(self, DataFrame) from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() import numpy as np import pandas as pd timezone = self.sql_ctx._conf.sessionLocalTimeZone() if self.sql_ctx._conf.arrowPySparkEnabled(): use_arrow = True try: from pyspark.sql.pandas.types import to_arrow_schema from pyspark.sql.pandas.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() to_arrow_schema(self.schema) except Exception as e: if self.sql_ctx._conf.arrowPySparkFallbackEnabled(): msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." % str(e)) warnings.warn(msg) use_arrow = False else: msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " "false.\n %s" % str(e)) warnings.warn(msg) raise # Try to use Arrow optimization when the schema is supported and the required version # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled. if use_arrow: try: from pyspark.sql.pandas.types import _check_series_localize_timestamps, \ _convert_map_items_to_dict import pyarrow # Rename columns to avoid duplicated column names. tmp_column_names = [ 'col_{}'.format(i) for i in range(len(self.columns)) ] self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled( ) batches = self.toDF(*tmp_column_names)._collect_as_arrow( split_batches=self_destruct) if len(batches) > 0: table = pyarrow.Table.from_batches(batches) # Ensure only the table has a reference to the batches, so that # self_destruct (if enabled) is effective del batches # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type # values, but we should use datetime.date to match the behavior with when # Arrow optimization is disabled. pandas_options = {'date_as_object': True} if self_destruct: # Configure PyArrow to use as little memory as possible: # self_destruct - free columns as they are converted # split_blocks - create a separate Pandas block for each column # use_threads - convert one column at a time pandas_options.update({ 'self_destruct': True, 'split_blocks': True, 'use_threads': False, }) pdf = table.to_pandas(**pandas_options) # Rename back to the original column names. pdf.columns = self.columns for field in self.schema: if isinstance(field.dataType, TimestampType): pdf[field.name] = \ _check_series_localize_timestamps(pdf[field.name], timezone) elif isinstance(field.dataType, MapType): pdf[field.name] = \ _convert_map_items_to_dict(pdf[field.name]) return pdf else: return pd.DataFrame.from_records([], columns=self.columns) except Exception as e: # We might have to allow fallback here as well but multiple Spark jobs can # be executed. So, simply fail in this case for now. msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and can not continue. Note that " "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an " "effect on failures in the middle of " "computation.\n %s" % str(e)) warnings.warn(msg) raise # Below is toPandas without Arrow optimization. pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) column_counter = Counter(self.columns) dtype = [None] * len(self.schema) for fieldIdx, field in enumerate(self.schema): # For duplicate column name, we use `iloc` to access it. if column_counter[field.name] > 1: pandas_col = pdf.iloc[:, fieldIdx] else: pandas_col = pdf[field.name] pandas_type = PandasConversionMixin._to_corrected_pandas_type( field.dataType) # SPARK-21766: if an integer field is nullable and has null values, it can be # inferred by pandas as float column. Once we convert the column with NaN back # to integer type e.g., np.int16, we will hit exception. So we use the inferred # float type, not the corrected type from the schema in this case. if pandas_type is not None and \ not(isinstance(field.dataType, IntegralType) and field.nullable and pandas_col.isnull().any()): dtype[fieldIdx] = pandas_type # Ensure we fall back to nullable numpy types, even when whole column is null: if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any(): dtype[fieldIdx] = np.float64 if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any(): dtype[fieldIdx] = np.object df = pd.DataFrame() for index, t in enumerate(dtype): column_name = self.schema[index].name # For duplicate column name, we use `iloc` to access it. if column_counter[column_name] > 1: series = pdf.iloc[:, index] else: series = pdf[column_name] if t is not None: series = series.astype(t, copy=False) # `insert` API makes copy of data, we only do it for Series of duplicate column names. # `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work because `iloc` could # return a view or a copy depending by context. if column_counter[column_name] > 1: df.insert(index, column_name, series, allow_duplicates=True) else: df[column_name] = series pdf = df if timezone is None: return pdf else: from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz for field in self.schema: # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): pdf[field.name] = \ _check_series_convert_timestamps_local_tz(pdf[field.name], timezone) return pdf
def toPandas(self): """ Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. This is only available if Pandas is installed and available. .. note:: This method should only be used if the resulting Pandas's :class:`DataFrame` is expected to be small, as all the data is loaded into the driver's memory. .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice 1 5 Bob """ from pyspark.sql.dataframe import DataFrame assert isinstance(self, DataFrame) from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() import numpy as np import pandas as pd timezone = self.sql_ctx._conf.sessionLocalTimeZone() if self.sql_ctx._conf.arrowPySparkEnabled(): use_arrow = True try: from pyspark.sql.pandas.types import to_arrow_schema from pyspark.sql.pandas.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() to_arrow_schema(self.schema) except Exception as e: if self.sql_ctx._conf.arrowPySparkFallbackEnabled(): msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." % _exception_message(e)) warnings.warn(msg) use_arrow = False else: msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " "false.\n %s" % _exception_message(e)) warnings.warn(msg) raise # Try to use Arrow optimization when the schema is supported and the required version # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled. if use_arrow: try: from pyspark.sql.pandas.types import _check_dataframe_localize_timestamps import pyarrow batches = self._collect_as_arrow() if len(batches) > 0: table = pyarrow.Table.from_batches(batches) # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type # values, but we should use datetime.date to match the behavior with when # Arrow optimization is disabled. pdf = table.to_pandas(date_as_object=True) return _check_dataframe_localize_timestamps( pdf, timezone) else: return pd.DataFrame.from_records([], columns=self.columns) except Exception as e: # We might have to allow fallback here as well but multiple Spark jobs can # be executed. So, simply fail in this case for now. msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and can not continue. Note that " "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an " "effect on failures in the middle of " "computation.\n %s" % _exception_message(e)) warnings.warn(msg) raise # Below is toPandas without Arrow optimization. pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) dtype = {} for field in self.schema: pandas_type = PandasConversionMixin._to_corrected_pandas_type( field.dataType) # SPARK-21766: if an integer field is nullable and has null values, it can be # inferred by pandas as float column. Once we convert the column with NaN back # to integer type e.g., np.int16, we will hit exception. So we use the inferred # float type, not the corrected type from the schema in this case. if pandas_type is not None and \ not(isinstance(field.dataType, IntegralType) and field.nullable and pdf[field.name].isnull().any()): dtype[field.name] = pandas_type # Ensure we fall back to nullable numpy types, even when whole column is null: if isinstance(field.dataType, IntegralType) and pdf[field.name].isnull().any(): dtype[field.name] = np.float64 if isinstance(field.dataType, BooleanType) and pdf[field.name].isnull().any(): dtype[field.name] = np.object for f, t in dtype.items(): pdf[f] = pdf[f].astype(t, copy=False) if timezone is None: return pdf else: from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz for field in self.schema: # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): pdf[field.name] = \ _check_series_convert_timestamps_local_tz(pdf[field.name], timezone) return pdf