def test_serialize_read_concatenated_records():
    # ARROW-1996 -- see stream alignment work in ARROW-2840, ARROW-3212
    sink = pa.BufferOutputStream()
    for value in (12, 23):
        pa.serialize_to(value, sink)
    reader = pa.BufferReader(sink.getvalue())
    # Both records must be readable back-to-back from the same stream.
    for _ in range(2):
        pa.read_serialized(reader).deserialize()
def test_serialize_read_concatenated_records():
    # ARROW-1996 -- see stream alignment work in ARROW-2840, ARROW-3212
    f = pa.BufferOutputStream()
    pa.serialize_to(12, f)
    pa.serialize_to(23, f)
    buf = f.getvalue()
    f = pa.BufferReader(buf)
    # Assert the round-tripped values: the bare reads only prove the stream
    # stays aligned between records, not that the payloads survive intact.
    assert pa.read_serialized(f).deserialize() == 12
    assert pa.read_serialized(f).deserialize() == 23
def get_results(self, timeout=None):
    """Returns results from worker pool

    :param timeout: If None, will block forever, otherwise will raise
      :class:`.TimeoutWaitingForResultError` exception if no data received
      within the timeout (in seconds)
    :return: arguments passed to ``publish_func(...)`` by a worker. If no
      more results are anticipated, :class:`.EmptyResultError` is raised.
    """
    while True:
        # If there is no more work to do, raise an EmptyResultError. We also
        # need to check if we are using a ventilator and if it is completed.
        if self._ventilated_items == self._ventilated_items_processed:
            if not self._ventilator or self._ventilator.completed():
                raise EmptyResultError()

        # poll() takes milliseconds. Use an explicit None check so that a
        # timeout of 0 means "poll once and return immediately" instead of
        # silently blocking forever (0 is falsy, so `if timeout` was wrong).
        socks = self._results_receiver_poller.poll(
            timeout * 1e3 if timeout is not None else None)
        if not socks:
            raise TimeoutWaitingForResultError()

        result = self._results_receiver.recv_pyobj(0)
        if isinstance(result, VentilatedItemProcessedMessage):
            # Bookkeeping message, not a payload: account for it and keep
            # waiting for an actual result.
            self._ventilated_items_processed += 1
            if self._ventilator:
                self._ventilator.processed_item()
            continue
        if isinstance(result, Exception):
            # A worker failed: shut the pool down, then surface the error
            # to the caller.
            self.stop()
            self.join()
            raise result
        else:
            deserialized_result = pyarrow.read_serialized(
                result).deserialize()
            return deserialized_result
def _load_file(self, file_path): if self.ext == ".json": return _read_json_file(file_path) elif self.ext == ".csv": return pd.read_csv(file_path, index_col=0, parse_dates=True) elif self.ext == ".pkl": with open(file_path, "rb") as f: return pickle.load(f) elif self.fname.endswith(".pa"): return pa.read_serialized(pa.OSFile(file_path, "rb")).deserialize() else: raise Exception("Unknown file type")
def test_serialization_deprecated():
    # Every entry point of the legacy serialization API must emit a
    # FutureWarning.
    with pytest.warns(FutureWarning):
        serialized = pa.serialize(1)
    with pytest.warns(FutureWarning):
        pa.deserialize(serialized.to_buffer())

    sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        pa.serialize_to(12, sink)
    reader = pa.BufferReader(sink.getvalue())
    with pytest.warns(FutureWarning):
        pa.read_serialized(reader).deserialize()

    with pytest.warns(FutureWarning):
        pa.default_serialization_context()
    ctx = pa.lib.SerializationContext()
    with pytest.warns(FutureWarning):
        pa.register_default_serialization_handlers(ctx)
def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    data = np.arange(12, dtype=np.int32)
    with open(path, 'wb') as f:
        f.write(pa.serialize(data).to_buffer())
    # Close the source file deterministically instead of leaking the handle:
    # read_serialized pulls the payload into in-memory buffers, so the
    # deserialized array must stay valid after the file is closed.
    with pa.OSFile(path) as source:
        serialized = pa.read_serialized(source)
    result = serialized.deserialize()
    assert_equal(result, data)
    # Dropping the serialized payload must not invalidate the array memory.
    serialized = None
    assert_equal(result, data)
    assert result.base is not None
def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    expected = np.arange(12, dtype=np.int32)
    with open(path, 'wb') as out:
        out.write(pa.serialize(expected).to_buffer())
    payload = pa.read_serialized(pa.OSFile(path))
    result = payload.deserialize()
    assert_equal(result, expected)
    # Releasing the serialized payload must not free the array's memory.
    payload = None
    assert_equal(result, expected)
    assert result.base is not None
def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    path = os.path.join(str(tmpdir), 'pyarrow_npmatrix_serialization_test.bin')
    array = np.random.randint(low=-1, high=1, size=(2, 2))

    for data_type in [str, int, float, CustomType]:
        matrix = np.matrix(array.astype(data_type))
        with open(path, 'wb') as f:
            f.write(pa.serialize(matrix).to_buffer())
        # Close the input file each iteration instead of leaking one handle
        # per dtype; the payload is fully read into memory by
        # read_serialized, so the matrix stays valid afterwards.
        with pa.OSFile(path) as source:
            serialized = pa.read_serialized(source)
        result = serialized.deserialize()
        assert_equal(result, matrix)
        assert_equal(result.dtype, matrix.dtype)
        # Dropping the serialized payload must not free the matrix memory.
        serialized = None
        assert_equal(result, matrix)
        assert result.base is not None
def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    target = os.path.join(str(tmpdir),
                          'pyarrow_npmatrix_serialization_test.bin')
    base_array = np.random.randint(low=-1, high=1, size=(2, 2))

    for cast in (str, int, float, CustomType):
        matrix = np.matrix(base_array.astype(cast))
        with open(target, 'wb') as sink:
            sink.write(pa.serialize(matrix).to_buffer())
        payload = pa.read_serialized(pa.OSFile(target))
        restored = payload.deserialize()
        assert_equal(restored, matrix)
        assert_equal(restored.dtype, matrix.dtype)
        # Releasing the payload must not free the matrix's backing memory.
        payload = None
        assert_equal(restored, matrix)
        assert restored.base is not None
def fast_read_state_dict(path):
    """Read a pyarrow-serialized state dict and wrap values as torch Tensors.

    :param path: source passed straight to ``pyarrow.read_serialized``
        (NOTE(review): looks like this expects an already-open file/reader
        rather than a path string -- confirm against callers).
    :return: dict mapping the original keys to ``torch.Tensor`` values.
    """
    serialized = pyarrow.read_serialized(path)
    sd = serialized.deserialize()
    # Iterating a dict yields keys only, so `for k, v in sd` cannot unpack
    # key/value pairs -- use .items(). Keep the raw iterable path for a
    # deserialized sequence of (key, value) pairs.
    pairs = sd.items() if isinstance(sd, dict) else sd
    return {k: torch.Tensor(v) for k, v in pairs}