def test_buffer_slicing():
    data = b'some data!'
    buf = pa.py_buffer(data)

    sliced = buf.slice(2)
    expected = pa.py_buffer(b'me data!')
    assert sliced.equals(expected)

    sliced2 = buf.slice(2, 4)
    expected2 = pa.py_buffer(b'me d')
    assert sliced2.equals(expected2)

    # 0 offset
    assert buf.slice(0).equals(buf)

    # Slice past end of buffer
    assert len(buf.slice(len(buf))) == 0

    with pytest.raises(IndexError):
        buf.slice(-1)

    # Test slice notation
    assert buf[2:].equals(buf.slice(2))
    assert buf[2:5].equals(buf.slice(2, 3))
    assert buf[-5:].equals(buf.slice(len(buf) - 5))

    with pytest.raises(IndexError):
        buf[::-1]
    with pytest.raises(IndexError):
        buf[::2]

    n = len(buf)
    for start in range(-n * 2, n * 2):
        for stop in range(-n * 2, n * 2):
            assert (buf[start:stop].to_pybytes() ==
                    buf.to_pybytes()[start:stop])
def test_buffer_equals():
    # Buffer.equals() returns true iff the buffers have the same contents
    def eq(a, b):
        assert a.equals(b)
        assert a == b
        assert not (a != b)

    def ne(a, b):
        assert not a.equals(b)
        assert not (a == b)
        assert a != b

    b1 = b'some data!'
    b2 = bytearray(b1)
    b3 = bytearray(b1)
    b3[0] = 42
    buf1 = pa.py_buffer(b1)
    buf2 = pa.py_buffer(b2)
    buf3 = pa.py_buffer(b2)
    buf4 = pa.py_buffer(b3)
    buf5 = pa.py_buffer(np.frombuffer(b2, dtype=np.int16))
    eq(buf1, buf1)
    eq(buf1, buf2)
    eq(buf2, buf3)
    ne(buf2, buf4)
    # The data type doesn't matter; only the raw contents are compared
    eq(buf2, buf5)
def test_buffer_from_numpy():
    # C-contiguous
    arr = np.arange(12, dtype=np.int8).reshape((3, 4))
    buf = pa.py_buffer(arr)
    assert buf.to_pybytes() == arr.tobytes()
    # F-contiguous; note that stride information is lost
    buf = pa.py_buffer(arr.T)
    assert buf.to_pybytes() == arr.tobytes()
    # Non-contiguous
    with pytest.raises(ValueError, match="not contiguous"):
        buf = pa.py_buffer(arr.T[::2])
def test_compress_decompress():
    INPUT_SIZE = 10000
    # np.ndarray.tostring() is a deprecated alias of tobytes()
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                           codec=codec, asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        # The decompressed size must be given explicitly
        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
def test_context_from_object(size):
    ctx = global_context
    arr, cbuf = make_random_buffer(size, target='device')
    dtype = arr.dtype

    # Creating device buffer from a CUDA host buffer
    hbuf = cuda.new_host_buffer(size * arr.dtype.itemsize)
    np.frombuffer(hbuf, dtype=dtype)[:] = arr
    cbuf2 = ctx.buffer_from_object(hbuf)
    assert cbuf2.size == cbuf.size
    arr2 = np.frombuffer(cbuf2.copy_to_host(), dtype=dtype)
    np.testing.assert_equal(arr, arr2)

    # Creating device buffer from a device buffer
    cbuf2 = ctx.buffer_from_object(cbuf2)
    assert cbuf2.size == cbuf.size
    arr2 = np.frombuffer(cbuf2.copy_to_host(), dtype=dtype)
    np.testing.assert_equal(arr, arr2)

    # Trying to create a device buffer from a Buffer
    with pytest.raises(pa.ArrowTypeError,
                       match=('buffer is not backed by a CudaBuffer')):
        ctx.buffer_from_object(pa.py_buffer(b"123"))

    # Trying to create a device buffer from a numpy.array
    with pytest.raises(pa.ArrowTypeError,
                       match=('cannot create device buffer view from'
                              ' `<class \'numpy.ndarray\'>` object')):
        ctx.buffer_from_object(np.array([1, 2, 3]))
def test_buffer_protocol_respects_immutability():
    # ARROW-3228: NumPy's frombuffer ctor determines whether a buffer-like
    # object is mutable by first requesting a writable buffer via
    # PyObject_GetBuffer. If that fails, it assumes that the object is
    # immutable
    a = b'12345'
    arrow_ref = pa.py_buffer(a)
    numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8)
    assert not numpy_ref.flags.writeable
def test_array_from_buffers():
    values_buf = pa.py_buffer(np.int16([4, 5, 6, 7]))
    # Validity bitmap 0b00001101: bit 1 is unset, so the second slot is null
    nulls_buf = pa.py_buffer(np.uint8([0b00001101]))
    arr = pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf])
    assert arr.type == pa.int16()
    assert arr.to_pylist() == [4, None, 6, 7]

    arr = pa.Array.from_buffers(pa.int16(), 4, [None, values_buf])
    assert arr.type == pa.int16()
    assert arr.to_pylist() == [4, 5, 6, 7]

    arr = pa.Array.from_buffers(pa.int16(), 3, [nulls_buf, values_buf],
                                offset=1)
    assert arr.type == pa.int16()
    assert arr.to_pylist() == [None, 6, 7]

    with pytest.raises(TypeError):
        pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1)
def test_buffer_address():
    b1 = b'some data!'
    b2 = bytearray(b1)
    b3 = bytearray(b1)

    buf1 = pa.py_buffer(b1)
    buf2 = pa.py_buffer(b1)
    buf3 = pa.py_buffer(b2)
    buf4 = pa.py_buffer(b3)

    assert buf1.address > 0
    assert buf1.address == buf2.address
    assert buf3.address != buf2.address
    assert buf4.address != buf3.address

    arr = np.arange(5)
    buf = pa.py_buffer(arr)
    assert buf.address == arr.ctypes.data
def test_buffer_memoryview():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable

    result = memoryview(buf)
    assert result == val
def test_buffer_bytearray():
    val = bytearray(b'some data')

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert buf.is_mutable

    result = bytearray(buf)
    assert result == val
def test_buffer_to_numpy():
    # Make sure creating a numpy array from an arrow buffer works
    byte_array = bytearray(20)
    byte_array[0] = 42
    buf = pa.py_buffer(byte_array)
    array = np.frombuffer(buf, dtype="uint8")
    assert array[0] == byte_array[0]
    byte_array[0] += 1
    assert array[0] == byte_array[0]
    assert array.base == buf
def test_buffer_bytes():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable

    result = buf.to_pybytes()
    assert result == val
def test_output_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"

    buf = bytearray(len(data))
    stream = pa.output_stream(pa.py_buffer(buf))
    stream.write(data)
    assert buf == data

    buf = bytearray(len(data))
    stream = pa.output_stream(memoryview(buf))
    stream.write(data)
    assert buf == data
def test_input_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"
    for arg in [pa.py_buffer(data), memoryview(data)]:
        stream = pa.input_stream(arg)
        assert stream.read() == data

    gz_data = gzip_compress(data)
    stream = pa.input_stream(memoryview(gz_data))
    assert stream.read() == gz_data
    stream = pa.input_stream(memoryview(gz_data), compression='gzip')
    assert stream.read() == data
def test_uninitialized_buffer():
    # ARROW-2039: calling Buffer() directly creates an uninitialized object
    check_uninitialized = partial(pytest.raises,
                                  ReferenceError, match="uninitialized")
    buf = pa.Buffer()
    with check_uninitialized():
        buf.size
    with check_uninitialized():
        len(buf)
    with check_uninitialized():
        buf.is_mutable
    with check_uninitialized():
        buf.parent
    with check_uninitialized():
        buf.to_pybytes()
    with check_uninitialized():
        memoryview(buf)
    with check_uninitialized():
        buf.equals(pa.py_buffer(b''))
    with check_uninitialized():
        pa.py_buffer(b'').equals(buf)
def test_buffer_getitem():
    data = bytearray(b'some data!')
    buf = pa.py_buffer(data)

    n = len(data)
    for ix in range(-n, n - 1):
        assert buf[ix] == data[ix]

    with pytest.raises(IndexError):
        buf[n]

    with pytest.raises(IndexError):
        buf[-n - 1]
def do_action(self, context, action):
    if action.type == "clear":
        raise NotImplementedError(
            "{} is not implemented.".format(action.type))
    elif action.type == "healthcheck":
        pass
    elif action.type == "shutdown":
        yield pyarrow.flight.Result(pyarrow.py_buffer(b'Shutdown!'))
        # Shut down on background thread to avoid blocking current
        # request
        threading.Thread(target=self._shutdown).start()
    else:
        raise KeyError(f"Unknown action {action.type!r}")
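# Hedged sketch of a client-side call for the "shutdown" action handled
# above. The endpoint URI and the running-server setup are assumptions,
# not part of the snippet.
import pyarrow.flight

client = pyarrow.flight.connect("grpc://localhost:8815")
for result in client.do_action(pyarrow.flight.Action("shutdown", b"")):
    print(result.body.to_pybytes())  # b'Shutdown!'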
def test_deserialize_buffer_in_different_process():
    import tempfile

    f = tempfile.NamedTemporaryFile(delete=False)
    b = pa.serialize(pa.py_buffer(b'hello')).to_buffer()
    f.write(b.to_pybytes())
    f.close()

    subprocess_env = test_util.get_modified_env_with_pythonpath()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    python_file = os.path.join(dir_path, 'deserialize_buffer.py')
    subprocess.check_call([sys.executable, python_file, f.name],
                          env=subprocess_env)
def test_buffer_bytes():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable

    result = buf.to_pybytes()
    assert result == val

    # Check that buffers survive a pickle roundtrip
    result_buf = pickle.loads(pickle.dumps(buf))
    result = result_buf.to_pybytes()
    assert result == val
def test_buffer_bytes():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable
    assert buf.is_cpu

    result = buf.to_pybytes()
    assert result == val

    # Check that buffers survive a pickle roundtrip
    result_buf = pickle.loads(pickle.dumps(buf))
    result = result_buf.to_pybytes()
    assert result == val
def to_arrow(self):
    offsets = self.offsets.to_arrow()
    elements = (
        pa.nulls(len(self.elements))
        if len(self.elements) == self.elements.null_count
        else self.elements.to_arrow()
    )
    pa_type = pa.list_(elements.type)

    if self.nullable:
        nbuf = self.mask.to_host_array().view("int8")
        nbuf = pa.py_buffer(nbuf)
        buffers = (nbuf, offsets.buffers()[1])
    else:
        buffers = offsets.buffers()

    return pa.ListArray.from_buffers(pa_type, len(self), buffers,
                                     children=[elements])
def test_buffer_memoryview_is_immutable():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert not buf.is_mutable
    assert isinstance(buf, pa.Buffer)

    result = memoryview(buf)
    assert result.readonly

    with pytest.raises(TypeError) as exc:
        result[0] = b'h'
    assert 'cannot modify read-only' in str(exc.value)

    b = bytes(buf)
    # bytes objects never support item assignment, regardless of the source
    with pytest.raises(TypeError):
        b[0] = b'h'
def _apply_binary_str_array(
    a: pa.Array,
    b: pa.Array,
    *,
    func: Callable,
    output_dtype,
    parallel: bool = False,
):
    out = np.empty(len(a), dtype=output_dtype)

    offsets_buffer_a, data_buffer_a = _extract_string_buffers(a)
    offsets_buffer_b, data_buffer_b = _extract_string_buffers(b)

    if a.null_count == 0 and b.null_count == 0:
        if parallel:
            call = _apply_no_nulls_parallel
        else:
            call = _apply_no_nulls
        call(
            func,
            len(a),
            offsets_buffer_a,
            data_buffer_a,
            offsets_buffer_b,
            data_buffer_b,
            out,
        )
        return pa.array(out)
    else:
        valid = _merge_valid_bitmaps(a, b)
        if parallel:
            call = _apply_with_nulls_parallel
        else:
            call = _apply_with_nulls
        call(
            func,
            len(a),
            valid,
            offsets_buffer_a,
            data_buffer_a,
            offsets_buffer_b,
            data_buffer_b,
            out,
        )
        buffers = [pa.py_buffer(x) for x in [valid, out]]
        return pa.Array.from_buffers(pa.int64(), len(out), buffers)
def adql_query(self, query_str, download=False, threads=1):
    """
    Query the ASAS-SN Sky Patrol Input Catalogs with an ADQL string.
    See README.md for more on accepted ADQL context and functions.

    :param query_str: ADQL query string
    :param download: return full light curves if True,
                     return catalog information if False
    :param threads: number of real threads to use for pulling
                    light curves from the server
    :return: if 'download' is False, a pandas DataFrame containing
             catalog information of targets; else a LightCurveCollection
    """
    # Check inputs
    if type(download) is not bool:
        raise ValueError("'download' must be boolean value")
    if type(threads) is not int:
        raise ValueError("'threads' must be integer value")
    if type(query_str) is not str:
        raise ValueError("'query_str' must be string value")

    # Trim ADQL input
    query_str = re.sub(' +', ' ', query_str).replace("\n", "")
    query_bytes = encodebytes(bytes(query_str, encoding='utf-8')).decode()

    # Query Flask API with SQL bytes
    url = f"http://asassn-lb01.ifa.hawaii.edu:9006/lookup_sql/{query_bytes}"
    response = requests.post(url, json={'format': 'arrow'})

    # Check response
    if response.status_code == 400:
        error = json.loads(response.content)['error_text']
        raise RuntimeError(error)

    # Deserialize from arrow
    buff = pa.py_buffer(response.content)
    tar_df = pa.deserialize(buff)
    self.index = tar_df

    if download is False:
        return tar_df
    else:
        tar_ids = list(tar_df['asas_sn_id'])
        return self._get_curves(tar_ids, "extrasolar", threads)
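# Hypothetical call pattern for adql_query; the client class name and the
# table/column names below are illustrative assumptions, not part of the
# snippet above.
client = SkyPatrolClient()
catalog_df = client.adql_query(
    "SELECT asas_sn_id, ra_deg, dec_deg FROM master_list LIMIT 10")
print(catalog_df.head())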
def test_config(self):
    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"

    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()

    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test")  # wrapper to the CronusStore message
        config_uuid = store.register_content(myconfig, configinfo).uuid
        store.put(config_uuid, myconfig)

        aconfig = Configuration()
        store.get(config_uuid, aconfig)
        self.assertEqual(myconfig.name, aconfig.name)
        self.assertEqual(myconfig.uuid, aconfig.uuid)
def test_serialize_to_components_invalid_cases():
    buf = pa.py_buffer(b'hello')

    components = {
        'num_tensors': 0,
        'num_buffers': 1,
        'data': [buf]
    }

    with pytest.raises(pa.ArrowInvalid):
        pa.deserialize_components(components)

    components = {
        'num_tensors': 1,
        'num_buffers': 0,
        'data': [buf, buf]
    }

    with pytest.raises(pa.ArrowInvalid):
        pa.deserialize_components(components)
def sampler(self):
    while self.nsamples > 0:
        self.__logger.info("%s: Generating datum " %
                           (self.__class__.__name__))
        data = self.gen_chunk()
        self.__logger.debug(
            "%s: type data: %s" % (self.__class__.__name__, type(data))
        )
        fileinfo = FileObjectInfo()
        fileinfo.type = 2
        fileinfo.partition = self.name
        job_id = f"{self.gate.meta.job_id}_sample_{self.nsamples}"
        ds_id = self.gate.meta.parentset_id
        id_ = self.gate.store.register_content(
            data,
            fileinfo,
            dataset_id=ds_id,
            partition_key=self.name,
            job_id=job_id,
        ).uuid
        buf = pa.py_buffer(data)
        self.gate.store.put(id_, buf)
        yield id_
        self.nsamples -= 1
        self.__logger.debug("Batch %i", self.nsamples)
def create_reader(url, compression=None):
    parts = urllib.parse.urlparse(url)

    # S3
    if parts.scheme == 's3':
        bucket = parts.netloc
        key = parts.path[1:]
        obj = Driver.s3_client().get_object(Bucket=bucket, Key=key)
        buf = obj['Body'].read()
        strm = pyarrow.input_stream(pyarrow.py_buffer(buf),
                                    compression=compression)
        return pyarrow.RecordBatchStreamReader(strm)
    # File System
    elif parts.scheme == 'file':
        path = os.path.join(parts.netloc, parts.path)
        strm = pyarrow.input_stream(path, compression=compression)
        return pyarrow.RecordBatchStreamReader(strm)
    else:
        raise Exception('URL {} not supported'.format(url))
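# Minimal sketch of the buffer-backed reading path above, with pure pyarrow
# (no S3): serialize a batch to IPC bytes, then read it back through
# input_stream + RecordBatchStreamReader.
import pyarrow

rb = pyarrow.RecordBatch.from_pydict({"x": [1, 2, 3]})
sink = pyarrow.BufferOutputStream()
with pyarrow.ipc.new_stream(sink, rb.schema) as w:
    w.write_batch(rb)
ipc_bytes = sink.getvalue().to_pybytes()

strm = pyarrow.input_stream(pyarrow.py_buffer(ipc_bytes))
reader = pyarrow.RecordBatchStreamReader(strm)
assert reader.read_all().num_rows == 3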
def test_menu(self):
    testmenu = Menu_pb()
    print(type(testmenu))
    print(testmenu)
    testmenu.uuid = str(uuid.uuid4())
    testmenu.name = f"{testmenu.uuid}.menu.dat"

    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()

    bufmenu = pa.py_buffer(testmenu.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test")  # wrapper to the CronusStore message
        menu_uuid = store.register_content(testmenu, menuinfo).uuid
        store.put(menu_uuid, testmenu)

        amenu = Menu_pb()
        store.get(menu_uuid, amenu)
        self.assertEqual(testmenu.name, amenu.name)
        self.assertEqual(testmenu.uuid, amenu.uuid)
def all_true_like(arr: pa.Array) -> pa.Array:
    """Return an all-True boolean array with the same size as the input
    and the same valid bitmap."""
    valid_buffer = arr.buffers()[0]
    if valid_buffer:
        valid_buffer = valid_buffer.slice(arr.offset // 8)

    output_offset = arr.offset % 8
    output_length = len(arr) + output_offset

    output_size = output_length // 8
    if output_length % 8 > 0:
        output_size += 1
    output = np.full(output_size, fill_value=255, dtype=np.uint8)

    return pa.Array.from_buffers(
        pa.bool_(),
        len(arr),
        [valid_buffer, pa.py_buffer(output)],
        arr.null_count,
        output_offset,
    )
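# A minimal check of the all_true_like contract above: the result keeps the
# input's validity bitmap, and every valid slot becomes True. Assumes
# pyarrow as pa, as elsewhere in these snippets.
import pyarrow as pa

arr = pa.array([True, None, False, True])
assert all_true_like(arr).equals(pa.array([True, None, True, True]))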
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern
    ``pat``.

    This implementation does basic byte-by-byte comparison and is
    independent of any locales or encodings.
    """
    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()

    # Initialise boolean (bit-packed) output array.
    output_size = len(data) // 8
    if len(data) % 8 > 0:
        output_size += 1
    output = np.empty(output_size, dtype=np.uint8)
    if len(data) % 8 > 0:
        # Zero trailing bits
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            len(data), offsets, data_buffer, pat_bytes, output
        )
    else:
        valid = _buffer_to_view(data.buffers()[0])
        _text_contains_case_sensitive_nulls(
            len(data), valid, data.offset, offsets, data_buffer,
            pat_bytes, output
        )
        valid_buffer = data.buffers()[0].slice(data.offset // 8)
        if data.offset % 8 != 0:
            valid_buffer = shift_unaligned_bitmap(
                valid_buffer, data.offset % 8, len(data)
            )

    return pa.Array.from_buffers(
        pa.bool_(), len(data), [valid_buffer, pa.py_buffer(output)],
        data.null_count
    )
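# Cross-check sketch: modern pyarrow ships equivalent semantics as a compute
# kernel (match_substring), so the helper above can be validated against it.
# Assumes the numba-jitted helpers referenced above are importable.
import pyarrow as pa
import pyarrow.compute as pc

data = pa.array(["hello", None, "world"])
expected = pc.match_substring(data, "or").to_pylist()  # [False, None, True]
assert _text_contains_case_sensitive(data, "or").to_pylist() == expected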
def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array."""
    output_length = len(arr) // 8
    if len(arr) % 8 != 0:
        output_length += 1

    if arr.null_count == 0:
        return pa.Array.from_buffers(
            pa.bool_(),
            len(arr),
            [arr.buffers()[1], arr.buffers()[1]],
            null_count=-1,
            offset=arr.offset,
        )
    else:
        output = np.zeros(output_length, dtype=np.uint8)
        null_count = _or_na(len(arr), arr.offset, arr.buffers()[0],
                            arr.buffers()[1], output)
        buf = pa.py_buffer(output)
        return pa.Array.from_buffers(pa.bool_(), len(arr), [buf, buf],
                                     null_count)
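# Quick sanity check of the Kleene logic implemented by or_na: True | NA is
# True, while False | NA is NA. The null-free branch exercised below is pure
# pyarrow, so no numba helper is needed for this case.
import pyarrow as pa

assert or_na(pa.array([True, False])).to_pylist() == [True, None]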
def trim_buffers(ar):
    # There are cases where memcopies or modifications are made
    # (large_string_to_string); in those cases we don't want to work on the
    # full array, so we get rid of the offset if possible.
    if ar.type == pa.string() or ar.type == pa.large_string():
        if isinstance(ar, pa.ChunkedArray):
            return ar  # let's assume chunked arrays are fine
        null_bitmap, offsets_buffer, bytes = ar.buffers()
        if ar.type == pa.string():
            offsets = np.frombuffer(offsets_buffer, np.int32,
                                    len(ar) + 1 + ar.offset)
        else:
            offsets = np.frombuffer(offsets_buffer, np.int64,
                                    len(ar) + 1 + ar.offset)
        # Because it is difficult to slice bits, only whole bytes of the
        # validity bitmap can be dropped
        new_offset = ar.offset % 8
        remove_offset = (ar.offset // 8) * 8
        first_offset = offsets[remove_offset]
        new_offsets = offsets[remove_offset:] - first_offset
        if null_bitmap:
            null_bitmap = null_bitmap.slice(ar.offset // 8)
        new_offsets_buffer = pa.py_buffer(new_offsets)
        bytes = bytes.slice(first_offset)
        ar = pa.Array.from_buffers(ar.type, len(ar),
                                   [null_bitmap, new_offsets_buffer, bytes],
                                   offset=new_offset)
    return ar
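# Small demonstration of trim_buffers on a sliced string array: the slice
# has offset 10, so one full bitmap byte (8 slots) worth of offset can be
# dropped, leaving only the unavoidable remainder of 2.
import numpy as np
import pyarrow as pa

arr = pa.array([str(i) for i in range(12)]).slice(10)
trimmed = trim_buffers(arr)
assert trimmed.offset == 2
assert trimmed.to_pylist() == ["10", "11"]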
def _make_mask(self):
    assert self.data.length is not None
    if self.data.type.equals(pyarrow.null()):
        self.data.null_count = self.data.length
        return None
    if self.data.length == 0:
        self.data.null_count = 0
        return None
    bits = self._decompress(self._doc[MASK])
    vals = numpy.unpackbits(
        numpy.ndarray(len(bits), numpy.uint8, bits), bitorder='big')
    self.data.null_count = self.data.length - numpy.sum(vals)
    if self.data.null_count == 0:
        return None
    mask = numpy.packbits(vals, bitorder='little')
    return pyarrow.py_buffer(mask.tobytes())
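# Standalone sketch of the bit-order conversion done in _make_mask: the
# stored mask is packed most-significant-bit first, while Arrow validity
# bitmaps are least-significant-bit first.
import numpy

bits = numpy.packbits([1, 0, 1, 1], bitorder='big')  # array([0b10110000])
vals = numpy.unpackbits(bits, bitorder='big')        # [1, 0, 1, 1, 0, 0, 0, 0]
mask = numpy.packbits(vals, bitorder='little')       # array([0b00001101])
assert mask[0] == 0b00001101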
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)

    if storage_options:
        raise ValueError("Reading vineyard dataframes does not currently "
                         "support storage options")
    builder["header_row"] = "1" if read_options.get("header_row", False) else "0"
    builder["delimiter"] = bytes(
        read_options.get("delimiter", ","), "utf-8"
    ).decode("unicode_escape")

    stream = builder.seal(client)
    client.persist(stream)
    ret = {"type": "return", "content": repr(stream.id)}
    print(json.dumps(ret), flush=True)

    name = urlparse(path).netloc
    # The "name" part in the URL can be a name, or an ObjectID for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer = stream.open_writer(client)
    for df in dataframes:
        rb = pa.RecordBatch.from_pandas(df)
        sink = pa.BufferOutputStream()
        rb_writer = pa.ipc.new_stream(sink, rb.schema)
        rb_writer.write_batch(rb)
        rb_writer.close()
        buf = sink.getvalue()
        chunk = writer.next(buf.size)
        buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
        buf_writer.write(buf)
        buf_writer.close()
    writer.finish()
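# Self-contained sketch of the IPC-into-fixed-size-buffer pattern used in
# the loop above, with pure pyarrow (no vineyard). The destination bytearray
# stands in for the chunk handed out by the stream writer.
import pyarrow as pa

rb = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, rb.schema) as w:
    w.write_batch(rb)
payload = sink.getvalue()

dest = bytearray(payload.size)
buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(dest))
buf_writer.write(payload)
buf_writer.close()
assert bytes(dest) == payload.to_pybytes()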
def apply(bytes, parameters=None):
    """
    Apply the deserialization to the bytes produced by Pyarrow serialization

    Parameters
    --------------
    bytes
        Bytes
    parameters
        Parameters of the algorithm

    Returns
    --------------
    deser
        Deserialized object
    """
    if parameters is None:
        parameters = {}

    buffer = pyarrow.py_buffer(bytes)
    list_events = pyarrow.deserialize(buffer)
    for i in range(len(list_events)):
        list_events[i] = Event(list_events[i])
    return EventStream(list_events)
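# Hypothetical round trip for apply() above. Assumes an older pyarrow that
# still ships serialize()/deserialize() (removed in recent releases), with
# Event/EventStream in scope as in the snippet.
import pyarrow

payload = pyarrow.serialize(
    [{"concept:name": "A"}, {"concept:name": "B"}]).to_buffer().to_pybytes()
stream = apply(payload)
assert len(stream) == 2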
def test_serialize_to_components_invalid_cases():
    buf = pa.py_buffer(b'hello')

    components = {
        'num_tensors': 0,
        'num_sparse_tensors': {'coo': 0, 'csr': 0},
        'num_ndarrays': 0,
        'num_buffers': 1,
        'data': [buf]
    }

    with pytest.raises(pa.ArrowInvalid):
        pa.deserialize_components(components)

    components = {
        'num_tensors': 0,
        'num_sparse_tensors': {'coo': 0, 'csr': 0},
        'num_ndarrays': 1,
        'num_buffers': 0,
        'data': [buf, buf]
    }

    with pytest.raises(pa.ArrowInvalid):
        pa.deserialize_components(components)
def serialize_buffer_class(obj):
    return pa.py_buffer(b"hello")
def test_buffer_hex(val, expected_hex_buffer):
    buf = pa.py_buffer(val)
    assert buf.hex() == expected_hex_buffer
def read_bytes(self, b, **kwargs):
    return self.read_csv(pa.py_buffer(b), **kwargs)
def make_buffer(bytes_obj):
    return bytearray(pa.py_buffer(bytes_obj))
def test_buffer_invalid():
    with pytest.raises(TypeError,
                       match="(bytes-like object|buffer interface)"):
        pa.py_buffer(None)
def test_buffer_hashing():
    # Buffers are unhashable
    with pytest.raises(TypeError, match="unhashable"):
        hash(pa.py_buffer(b'123'))