Example #1
    def load_stream(self, stream):
        """
        Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
        """
        from pyspark.sql.types import _check_dataframe_localize_timestamps
        import pyarrow as pa
        reader = pa.open_stream(stream)
        for batch in reader:
            # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1
            pdf = _check_dataframe_localize_timestamps(batch.to_pandas())
            yield [c for _, c in pdf.iteritems()]
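The pa.open_stream call above is the pre-1.0 pyarrow spelling; it was later replaced by pa.ipc.open_stream. A self-contained sketch of the round trip this load_stream performs, using the current pa.ipc API (the toy batch and column name are illustrative, not part of the original):

import pyarrow as pa

# Build a one-column RecordBatch and serialize it as an Arrow IPC stream,
# standing in for the stream Spark sends over from the JVM.
batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

# Read it back the way load_stream does: one list of pandas.Series per batch.
reader = pa.ipc.open_stream(sink.getvalue())
for batch in reader:
    pdf = batch.to_pandas()
    print([series for _, series in pdf.items()])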
Example #2
    def load_stream(self, stream):
        """
        Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
        """
        from pyspark.sql.types import _check_dataframe_localize_timestamps
        import pyarrow as pa
        reader = pa.open_stream(stream)
        for batch in reader:
            # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1
            pdf = _check_dataframe_localize_timestamps(batch.to_pandas(), self._timezone)
            yield [c for _, c in pdf.iteritems()]
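Example #2 threads self._timezone (the Spark session timezone) into the helper. _check_dataframe_localize_timestamps is a PySpark internal; a minimal standalone sketch of the kind of localization it performs, with a hypothetical stand-in helper:

import pandas as pd

def localize_timestamps(pdf, timezone):
    # Hypothetical stand-in for PySpark's internal helper: convert each
    # tz-aware timestamp column to the target timezone, then drop the tz
    # info so the resulting Series is tz-naive.
    for name in pdf.columns:
        if isinstance(pdf[name].dtype, pd.DatetimeTZDtype):
            pdf[name] = pdf[name].dt.tz_convert(timezone).dt.tz_localize(None)
    return pdf

pdf = pd.DataFrame({"ts": pd.to_datetime(["2018-01-01 12:00"]).tz_localize("UTC")})
print(localize_timestamps(pdf, "America/Los_Angeles")["ts"])  # 2018-01-01 04:00, tz-naive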
Example #3
    def arrow_to_pandas(self, arrow_column, data_type):
        from pyspark.sql.types import StructType, \
            _arrow_column_to_pandas, _check_dataframe_localize_timestamps

        if self._df_for_struct and type(data_type) == StructType:
            import pandas as pd
            series = [_arrow_column_to_pandas(column, field.dataType).rename(field.name)
                      for column, field in zip(arrow_column.flatten(), data_type)]
            s = _check_dataframe_localize_timestamps(pd.concat(series, axis=1), self._timezone)
        else:
            s = super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(arrow_column, data_type)
        return s
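When a UDF gets a DataFrame for a struct column, example #3 (and its reformatted twin in #5 below) splits the Arrow struct column into one pandas.Series per field. A minimal sketch of that flattening with plain pyarrow and pandas in place of PySpark's internal _arrow_column_to_pandas (the toy struct data is illustrative):

import pandas as pd
import pyarrow as pa

# A toy struct column standing in for arrow_column above.
struct_col = pa.array([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
fields = [struct_col.type.field(i) for i in range(struct_col.type.num_fields)]

# flatten() yields one child array per struct field; name each resulting
# Series after its field and stitch them into a DataFrame, as in example #3.
series = [child.to_pandas().rename(field.name)
          for child, field in zip(struct_col.flatten(), fields)]
print(pd.concat(series, axis=1))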
Example #4
    def load_stream(self, stream):
        """
        Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
        """
        from pyspark.sql.types import from_arrow_schema, _check_dataframe_convert_date, \
            _check_dataframe_localize_timestamps
        import pyarrow as pa
        reader = pa.open_stream(stream)
        schema = from_arrow_schema(reader.schema)
        for batch in reader:
            pdf = batch.to_pandas()
            pdf = _check_dataframe_convert_date(pdf, schema)
            pdf = _check_dataframe_localize_timestamps(pdf, self._timezone)
            yield [c for _, c in pdf.iteritems()]
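Example #4 adds _check_dataframe_convert_date, another PySpark internal: Arrow hands date columns back as datetime64[ns], and Spark wants datetime.date objects for DateType fields. A hedged sketch of that conversion; the helper name and the explicit column list are stand-ins, since the real helper derives the date columns from the Spark schema:

import pandas as pd

def convert_date_columns(pdf, date_columns):
    # Hypothetical stand-in for PySpark's internal helper: for columns the
    # schema declares as DateType, replace datetime64 values with
    # datetime.date objects (Series.dt.date yields an object-dtype Series).
    for name in date_columns:
        pdf[name] = pdf[name].dt.date
    return pdf

pdf = pd.DataFrame({"d": pd.to_datetime(["2018-01-01", "2018-01-02"])})
print(convert_date_columns(pdf, ["d"])["d"])  # object dtype holding datetime.date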
Example #5
    def arrow_to_pandas(self, arrow_column, data_type):
        from pyspark.sql.types import StructType, \
            _arrow_column_to_pandas, _check_dataframe_localize_timestamps

        if self._df_for_struct and type(data_type) == StructType:
            import pandas as pd
            series = [
                _arrow_column_to_pandas(column,
                                        field.dataType).rename(field.name)
                for column, field in zip(arrow_column.flatten(), data_type)
            ]
            s = _check_dataframe_localize_timestamps(pd.concat(series, axis=1),
                                                     self._timezone)
        else:
            s = super(ArrowStreamPandasUDFSerializer,
                      self).arrow_to_pandas(arrow_column, data_type)
        return s