class ArrowCoderImpl(FieldCoderImpl):
    """A coder which (de)serializes pandas data in the Arrow streaming format."""

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        self._resettable_io = ResettableIO()
        # Lazy generator: each pull decodes one batch from whatever bytes
        # the resettable IO currently wraps.
        self._batch_reader = ArrowCoderImpl._load_from_stream(self._resettable_io)

    def encode_to_stream(self, cols, out_stream):
        """Serializes the given pandas columns as a single Arrow record batch."""
        self._resettable_io.set_output_stream(out_stream)
        writer = pa.RecordBatchStreamWriter(self._resettable_io, self._schema)
        batch = pandas_to_arrow(self._schema, self._timezone, self._field_types, cols)
        writer.write_batch(batch)

    def decode_from_stream(self, in_stream, length=0):
        """Delegates to the single-batch decoder."""
        return self.decode_one_batch_from_stream(in_stream, length)

    @staticmethod
    def _load_from_stream(stream):
        # Re-opens the IPC stream for every batch so each fresh payload's
        # schema header is consumed before the batch itself is read.
        while stream.readable():
            yield pa.ipc.open_stream(stream).read_next_batch()

    def decode_one_batch_from_stream(self, in_stream: InputStream, size: int) -> List:
        """Reads `size` bytes and decodes them as exactly one Arrow batch."""
        self._resettable_io.set_input_bytes(in_stream.read(size))
        # the underlying input stream holds exactly one arrow batch
        return arrow_to_pandas(self._timezone, self._field_types,
                               [next(self._batch_reader)])

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
class ArrowCoderImpl(StreamCoderImpl):
    """A coder that streams Arrow record batches through a reusable in-memory buffer."""

    def __init__(self, schema):
        self._schema = schema
        self._resettable_io = ResettableIO()
        self._batch_reader = ArrowCoderImpl._load_from_stream(self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(self._resettable_io,
                                                        self._schema)
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        """Writes each set of columns as one var-int length-prefixed Arrow batch."""
        buffer = self.data_out_stream
        for cols in iter_cols:
            self._batch_writer.write_batch(self._create_batch(cols))
            out_stream.write_var_int64(buffer.size())
            out_stream.write(buffer.get())
            buffer._clear()

    def decode_from_stream(self, in_stream, nested):
        """Yields decoded batches until the input stream is drained."""
        while in_stream.size() > 0:
            yield self._decode_one_batch_from_stream(in_stream)

    @staticmethod
    def _load_from_stream(stream):
        # Generator over the record batches of one Arrow IPC stream.
        for batch in pa.ipc.open_stream(stream):
            yield batch

    def _create_batch(self, cols):
        """Converts the given pandas series into one Arrow record batch."""
        def to_arrow_array(s, t):
            try:
                return pa.Array.from_pandas(s, mask=s.isnull(), type=t)
            except pa.ArrowException as e:
                error_msg = "Exception thrown when converting pandas.Series (%s) to " \
                            "pyarrow.Array (%s)."
                raise RuntimeError(error_msg % (s.dtype, t), e)

        arrays = [to_arrow_array(cols[i], field_type)
                  for i, field_type in enumerate(self._schema.types)]
        return pa.RecordBatch.from_arrays(arrays, self._schema)

    def _decode_one_batch_from_stream(self, in_stream: create_InputStream) -> List:
        """Decodes all remaining bytes of the stream as a single Arrow batch."""
        self._resettable_io.set_input_bytes(in_stream.read_all(True))
        # the underlying input stream holds exactly one arrow batch
        table = pa.Table.from_batches([next(self._batch_reader)])
        return [col.to_pandas(date_as_object=True) for col in table.itercolumns()]

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
class ArrowCoderImpl(StreamCoderImpl):
    """
    A coder for arrow format data: encodes an iterator of pandas columns as
    var-int length-prefixed Arrow record batches, and decodes such frames
    back into pandas data.
    """

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        # ResettableIO lets the long-lived reader/writer below be re-pointed
        # at new byte buffers without being recreated per batch.
        self._resettable_io = ResettableIO()
        self._batch_reader = ArrowCoderImpl._load_from_stream(
            self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(
            self._resettable_io, self._schema)
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        """
        Writes each element of ``iter_cols`` as one Arrow batch: the batch is
        serialized into the internal buffer, then emitted as a var-int length
        followed by the payload, and the buffer is cleared for the next batch.
        """
        data_out_stream = self.data_out_stream
        for cols in iter_cols:
            self._batch_writer.write_batch(
                pandas_to_arrow(self._schema, self._timezone, self._field_types, cols))
            out_stream.write_var_int64(data_out_stream.size())
            out_stream.write(data_out_stream.get())
            data_out_stream._clear()

    def decode_from_stream(self, in_stream, nested):
        """Yields one decoded batch per length-prefixed frame until the stream is drained."""
        while in_stream.size() > 0:
            yield self._decode_one_batch_from_stream(
                in_stream, in_stream.read_var_int64())

    @staticmethod
    def _load_from_stream(stream):
        # Generator over the record batches read from an Arrow IPC stream.
        reader = pa.ipc.open_stream(stream)
        for batch in reader:
            yield batch

    def _decode_one_batch_from_stream(self, in_stream: create_InputStream, size: int) -> List:
        """Reads ``size`` bytes from the stream and decodes them as one Arrow batch."""
        self._resettable_io.set_input_bytes(in_stream.read(size))
        # there is only one arrow batch in the underlying input stream
        return arrow_to_pandas(self._timezone, self._field_types, [next(self._batch_reader)])

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
class ArrowCoderImpl(StreamCoderImpl):
    """
    A coder for arrow format data. Encodes an iterator of pandas columns as
    var-int length-prefixed Arrow record batches, converting timestamp
    columns to the internal (timezone-naive) representation on the way out
    and re-attaching the local timezone on the way in.
    """

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        # ResettableIO lets the long-lived reader/writer below be re-pointed
        # at new byte buffers without being recreated per batch.
        self._resettable_io = ResettableIO()
        self._batch_reader = ArrowCoderImpl._load_from_stream(
            self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(
            self._resettable_io, self._schema)
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        """Writes each element of ``iter_cols`` as one var-int length-prefixed Arrow batch."""
        data_out_stream = self.data_out_stream
        for cols in iter_cols:
            self._batch_writer.write_batch(self._create_batch(cols))
            out_stream.write_var_int64(data_out_stream.size())
            out_stream.write(data_out_stream.get())
            data_out_stream._clear()

    def decode_from_stream(self, in_stream, nested):
        """Yields one decoded batch at a time until the stream is drained."""
        while in_stream.size() > 0:
            yield self._decode_one_batch_from_stream(in_stream)

    @staticmethod
    def _load_from_stream(stream):
        # Generator over the record batches read from an Arrow IPC stream.
        reader = pa.ipc.open_stream(stream)
        for batch in reader:
            yield batch

    def _create_batch(self, cols):
        """Converts the given pandas series into one Arrow record batch."""
        def create_array(s, t):
            try:
                return pa.Array.from_pandas(s, mask=s.isnull(), type=t)
            except pa.ArrowException as e:
                error_msg = "Exception thrown when converting pandas.Series (%s) to " \
                            "pyarrow.Array (%s)."
                # Explicitly chain the original ArrowException so the root
                # cause appears in the traceback; ``e`` stays in args for
                # backward compatibility with existing callers.
                raise RuntimeError(error_msg % (s.dtype, t), e) from e

        arrays = [
            create_array(
                ArrowCoderImpl.tz_convert_to_internal(cols[i], self._field_types[i],
                                                      self._timezone),
                self._schema.types[i]) for i in range(0, len(self._schema))
        ]
        return pa.RecordBatch.from_arrays(arrays, self._schema)

    def _decode_one_batch_from_stream(self, in_stream: create_InputStream) -> List:
        """Decodes all remaining bytes of the stream as a single Arrow batch."""
        self._resettable_io.set_input_bytes(in_stream.read_all(True))
        # there is only one arrow batch in the underlying input stream
        table = pa.Table.from_batches([next(self._batch_reader)])
        return [
            ArrowCoderImpl.tz_convert_from_internal(
                c.to_pandas(date_as_object=True), t, self._timezone)
            for c, t in zip(table.itercolumns(), self._field_types)
        ]

    @staticmethod
    def tz_convert_from_internal(s: pd.Series, t: DataType, local_tz) -> pd.Series:
        """
        Converts the timestamp series from internal according to the specified local
        timezone.

        Returns the same series if the series is not a timestamp series. Otherwise,
        returns a converted series.
        """
        if type(t) == LocalZonedTimestampType:
            return s.dt.tz_localize(local_tz)
        else:
            return s

    @staticmethod
    def tz_convert_to_internal(s: pd.Series, t: DataType, local_tz) -> pd.Series:
        """
        Converts the timestamp series to internal according to the specified local
        timezone.
        """
        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        if type(t) == LocalZonedTimestampType:
            if is_datetime64_dtype(s.dtype):
                # naive timestamps are already in the internal representation
                return s.dt.tz_localize(None)
            elif is_datetime64tz_dtype(s.dtype):
                # normalize to the session timezone, then drop the tz info
                return s.dt.tz_convert(local_tz).dt.tz_localize(None)
        return s

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema