# Assumes module-level imports: random, string, pyarrow as pa, pyarrow.plasma as plasma,
# and a connected plasma `client`.
def put_df(df):
    id_num = str(df['rIDs'].values[0])
    if id_num == '23':
        id_num = 'X'
    elif id_num == '24':
        id_num = 'Y'
    elif id_num == '25':
        id_num = 'M'

    record_batch = pa.RecordBatch.from_pandas(df)
    record_batch_rows = record_batch.num_rows
    record_batch_rows_actual = record_batch_rows
    index = 0
    limit = 5714285
    check = False
    print(record_batch_rows_actual)

    i = 0
    while record_batch_rows > limit:
        check = True
        record_batch_rows = record_batch_rows - limit
        record_batch_slice = record_batch.slice(index, limit)
        index = index + limit

        # Get size of record batch and schema
        mock_sink = pa.MockOutputStream()
        stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch_slice.schema)
        stream_writer.write_batch(record_batch_slice)
        data_size = mock_sink.size()

        # Generate a 20-character ASCII ID and allocate a buffer in the object
        # store for the serialized DataFrame
        raw_id = ''.join(random.choice(string.ascii_uppercase + string.digits)
                         for _ in range(20))
        object_id = plasma.ObjectID(raw_id.encode())  # ObjectID expects 20 raw bytes
        #print(id_num)
        buf = client.create(object_id, data_size)

        # Write the serialized DataFrame to the object store
        sink = pa.FixedSizeBufferWriter(buf)
        stream_writer = pa.RecordBatchStreamWriter(sink, record_batch_slice.schema)
        stream_writer.write_batch(record_batch_slice)

        # Seal the object
        client.seal(object_id)

        with open("/home/tahmad/bulk/apps/objIDsPy.txt", "a") as f:
            f.write('Chr' + id_num + '_' + str(i) + '\t' + raw_id + '\n')
        i = i + 1

    if check:
        # Only the remainder after the sliced chunks is left to write.
        record_batch = record_batch.slice(index, record_batch_rows)

    # Get size of record batch and schema
    mock_sink = pa.MockOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    data_size = mock_sink.size()

    # Generate an ID and allocate a buffer in the object store for the
    # serialized DataFrame
    raw_id = ''.join(random.choice(string.ascii_uppercase + string.digits)
                     for _ in range(20))
    object_id = plasma.ObjectID(raw_id.encode())
    #print(id_num)
    buf = client.create(object_id, data_size)

    # Write the serialized DataFrame to the object store
    sink = pa.FixedSizeBufferWriter(buf)
    stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
    stream_writer.write_batch(record_batch)

    # Seal the object
    client.seal(object_id)
    #get_df(object_id)  # Loopback

    return object_id, id_num
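A minimal loopback sketch for the commented-out get_df call above, assuming the same module-level plasma `client`; the helper name and the use of pa.RecordBatchStreamReader over the sealed buffer are illustrative, not taken from the original module.

def get_df(object_id):
    # Fetch the sealed buffer back from the plasma store and rebuild the
    # RecordBatch stream that put_df wrote into it.
    [buf] = client.get_buffers([object_id])
    reader = pa.RecordBatchStreamReader(pa.BufferReader(buf))
    # Reassemble the batches into a table and hand back a pandas DataFrame.
    return reader.read_all().to_pandas()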
def __init__(self, vineyard_client, session_id, data_key, data_id, mode='w',
             nbytes=None, packed=False, compress=None, auto_register=True,
             pin_token=None, handler=None):
    from .objectholder import SharedHolderActor

    logger.debug('create vineyard bytes IO: mode = %s, packed = %s, compress = %r',
                 mode, packed, compress)
    super().__init__(session_id, data_key, mode=mode, handler=handler)
    self._client = vineyard_client
    self._data_id = data_id
    self._buffer = None
    self._offset = 0
    self._nbytes = nbytes
    self._holder_ref = self._storage_ctx.actor_ctx.actor_ref(
        SharedHolderActor.default_uid())
    self._compress = compress or dataserializer.CompressType.NONE
    self._packed = packed
    self._auto_register = auto_register
    self._pin_token = pin_token

    block_size = options.worker.copy_block_size

    if self.is_writable:
        logger.debug('bytes io write: session_id = %s, data_key = %s, size = %d',
                     session_id, data_key, nbytes)
        self._buffer = pyarrow.allocate_buffer(nbytes, resizable=False)
        if packed:
            self._buf = ArrowBufferIO(self._buffer, 'w', block_size=block_size)
        else:
            self._buf = pyarrow.FixedSizeBufferWriter(self._buffer)
            self._buf.set_memcopy_threads(6)
    elif self.is_readable:
        logger.debug('bytes io get: session_id = %s, data_key = %s, data_id = %r',
                     session_id, data_key, data_id)
        data = self._client.get(data_id)
        self._buffer = pyarrow.serialize(
            data, dataserializer.mars_serialize_context()).to_buffer()
        if packed:
            self._buf = ArrowBufferIO(
                self._buffer, 'r', compress_out=compress, block_size=block_size)
            self._nbytes = len(self._buffer)
        else:
            self._mv = memoryview(self._buffer)
            self._nbytes = len(self._buffer)
    else:
        raise NotImplementedError
def _write_init(self):
    self._buffer = buf = self._plasma_client.create(self._object_id, self._size)
    file = self._file = pa.FixedSizeBufferWriter(buf)
    file.set_memcopy_threads(6)
def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)
    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    start = time.clock()
    print("data loaded")
    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")
    print(2)

    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys
authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
""" if rand else """
import os
import pyarrow
import sys
ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
"""

    tables = []
    for i in range(len(batches)):
        id_ = randString()
        strId = makeID(id_)

        mock_sink = pyarrow.MockOutputStream()  # find data size
        stream_writer = pyarrow.RecordBatchStreamWriter(mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()

        buf = client.create(strId, data_size)
        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)
        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            datatable = pyarrow.concat_tables([
                datatable,
                datatable.from_batches(reader.read_all().to_batches())
            ])

    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.clock() - start))
    return html
def testPlasmaSharedStore(self):
    import pyarrow
    from pyarrow import plasma

    store_size = 10 * 1024**2
    test_addr = f'127.0.0.1:{get_next_port()}'
    options.worker.plasma_dir = '/dev/shm' if os.path.exists('/dev/shm') else '/tmp'
    with plasma.start_plasma_store(store_size) as (sckt, _), \
            create_actor_pool(n_process=1, address=test_addr) as pool:
        km_ref = pool.create_actor(PlasmaKeyMapActor,
                                   uid=PlasmaKeyMapActor.default_uid())
        try:
            plasma_client = plasma.connect(sckt)
        except TypeError:
            plasma_client = plasma.connect(sckt, '', 0)
        store = PlasmaSharedStore(plasma_client, km_ref)

        self.assertGreater(store.get_actual_capacity(store_size), store_size / 2)

        session_id = str(uuid.uuid4())
        data_list = [np.random.randint(0, 32767, (655360,), np.int16)
                     for _ in range(20)]
        key_list = [str(uuid.uuid4()) for _ in range(20)]

        self.assertFalse(store.contains(session_id, str(uuid.uuid4())))
        with self.assertRaises(KeyError):
            store.get(session_id, str(uuid.uuid4()))
        with self.assertRaises(KeyError):
            store.get_actual_size(session_id, str(uuid.uuid4()))
        with self.assertRaises(KeyError):
            store.seal(session_id, str(uuid.uuid4()))

        fake_data_key = str(uuid.uuid4())
        km_ref.put(session_id, fake_data_key, plasma.ObjectID.from_random())
        self.assertFalse(store.contains(session_id, fake_data_key))
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        with self.assertRaises(KeyError):
            km_ref.put(session_id, fake_data_key, plasma.ObjectID.from_random())
            store.get(session_id, fake_data_key)
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        with self.assertRaises(KeyError):
            km_ref.put(session_id, fake_data_key, plasma.ObjectID.from_random())
            store.seal(session_id, fake_data_key)
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        with self.assertRaises(KeyError):
            km_ref.put(session_id, fake_data_key, plasma.ObjectID.from_random())
            store.get_actual_size(session_id, fake_data_key)
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        with self.assertRaises(KeyError):
            km_ref.put(session_id, fake_data_key, plasma.ObjectID.from_random())
            store.get_buffer(session_id, fake_data_key)
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        store.delete(session_id, fake_data_key)

        with self.assertRaises(SerializationFailed):
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            store.put(session_id, fake_data_key, non_serial())
        self.assertIsNone(km_ref.get(session_id, fake_data_key))
        with self.assertRaises(Exception):
            store.create(session_id, fake_data_key, 'abcd')
        self.assertIsNone(km_ref.get(session_id, fake_data_key))
        with self.assertRaises(StorageFull):
            store.create(session_id, fake_data_key, store_size * 2)
        self.assertIsNone(km_ref.get(session_id, fake_data_key))

        arrow_ser = pyarrow.serialize(data_list[0])
        buf = store.create(session_id, key_list[0], arrow_ser.total_bytes)
        writer = pyarrow.FixedSizeBufferWriter(buf)
        arrow_ser.write_to(writer)
        writer.close()
        store.seal(session_id, key_list[0])

        self.assertTrue(store.contains(session_id, key_list[0]))
        self.assertEqual(store.get_actual_size(session_id, key_list[0]),
                         arrow_ser.total_bytes)
        assert_allclose(store.get(session_id, key_list[0]), data_list[0])
        assert_allclose(
            pyarrow.deserialize(store.get_buffer(session_id, key_list[0])),
            data_list[0])

        with self.assertRaises(StorageDataExists):
            store.create(session_id, key_list[0], arrow_ser.total_bytes)
        self.assertIsNotNone(km_ref.get(session_id, key_list[0]))
        store.delete(session_id, key_list[0])

        del buf
        bufs = []
        for key, data in zip(key_list, data_list):
            try:
                bufs.append(store.put(session_id, key, data))
            except StorageFull:
                break
        del bufs

        store._plasma_limit = 0
        with self.assertRaises(StorageFull):
            store.create(session_id, fake_data_key, store_size * 2)
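The create / FixedSizeBufferWriter / seal path exercised by the test above reduces to the following round trip against a bare plasma client. This is a hedged sketch using the legacy pyarrow.serialize API (the same one the test relies on); the socket path is a placeholder.

import numpy as np
import pyarrow
from pyarrow import plasma

client = plasma.connect('/tmp/plasma')          # placeholder socket path
data = np.random.rand(1024)

ser = pyarrow.serialize(data)                   # legacy serialization, as in the test
obj_id = plasma.ObjectID.from_random()
buf = client.create(obj_id, ser.total_bytes)    # reserve space in the store

writer = pyarrow.FixedSizeBufferWriter(buf)
ser.write_to(writer)                            # copy the serialized payload in place
writer.close()
client.seal(obj_id)                             # make the object visible to readers

[out] = client.get_buffers([obj_id])
restored = pyarrow.deserialize(out)             # round-trips back to the ndarray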
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): The index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. "
                f"Expected: {meta_file} and {blob_file}")
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))

        # Used for reading bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4

        try:
            of = fsspec.open(path, mode="rb", **storage_options)
        except Exception as e:
            report_status("error", str(e))
            raise
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)

            while begin < end:
                buf = read_block(f, begin, min(chunk_size, end - begin),
                                 delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
            writer.finish()
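A hedged invocation sketch for read_bytes; the socket path, file path, and options below are illustrative placeholders, and a vineyard instance is assumed to be listening on the socket.

read_bytes(
    vineyard_socket="/tmp/vineyard.sock",       # assumed local vineyard IPC socket
    path="file:///tmp/input.csv",               # placeholder CSV readable via fsspec
    storage_options={},
    read_options={"header_row": True, "delimiter": ","},
    proc_num=1,                                 # a single worker reads the whole file
    proc_index=0,
)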
def _write_init(self):
    self._buffer = buf = self._client.create_blob(self._size)
    self._object_id = buf.id
    file = self._file = pa.FixedSizeBufferWriter(buf.buffer)
    file.set_memcopy_threads(6)
def _output_to_memory(
    obj: pa.Buffer,
    client: plasma.PlasmaClient,
    obj_id: Optional[plasma.ObjectID] = None,
    metadata: Optional[bytes] = None,
    memcopy_threads: int = 6,
) -> plasma.ObjectID:
    """Outputs an object to memory.

    Args:
        obj: Object to output to memory.
        client: A PlasmaClient to interface with the in-memory object store.
        obj_id: The ID to assign to the `obj` inside the plasma store. If
            ``None`` then one is randomly generated.
        metadata: Metadata to add to the `obj` inside the store.
        memcopy_threads: The number of threads to use to write the `obj` into
            the object store for large objects.

    Returns:
        The ID of the object inside the store. Either the given `obj_id` or a
        randomly generated one.

    Raises:
        MemoryError: If the `obj` does not fit in memory.
    """
    # Check whether the object to be passed in memory actually fits in
    # memory. We check explicitly instead of trying to insert it, because
    # inserting into an already full Plasma store will start evicting
    # objects to free up space. However, we want to maintain control over
    # which objects get evicted.
    # obj.size -> "The buffer size in bytes."
    total_size = obj.size
    if metadata is not None:
        total_size += len(metadata)

    occupied_size = sum(obj["data_size"] + obj["metadata_size"]
                        for obj in client.list().values())
    # Take a percentage of the maximum capacity such that the message
    # for object eviction always fits inside the store.
    store_capacity = Config.MAX_RELATIVE_STORE_CAPACITY * client.store_capacity()
    available_size = store_capacity - occupied_size

    if total_size > available_size:
        raise MemoryError("Object does not fit in memory")

    # In case no `obj_id` is specified, one has to be generated because
    # an ID is required for an object to be inserted in the store.
    if obj_id is None:
        obj_id = plasma.ObjectID.from_random()

    # Write the object to the plasma store. If the obj_id already exists,
    # then it first has to be deleted. Essentially we are overwriting the
    # data (just like we do for disk).
    try:
        buffer = client.create(obj_id, obj.size, metadata=metadata)
    except plasma.PlasmaObjectExists:
        client.delete([obj_id])
        buffer = client.create(obj_id, obj.size, metadata=metadata)

    stream = pa.FixedSizeBufferWriter(buffer)
    stream.set_memcopy_threads(memcopy_threads)
    stream.write(obj)
    client.seal(obj_id)

    return obj_id
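A short usage sketch, assuming a plasma store is already running at the placeholder socket path and that Config.MAX_RELATIVE_STORE_CAPACITY is defined as in the surrounding module.

client = plasma.connect("/tmp/plasma")          # placeholder socket path
payload = pa.py_buffer(b"example payload")      # any pa.Buffer works here
obj_id = _output_to_memory(payload, client, metadata=b"v1")
print(obj_id)                                   # the ID under which the buffer was sealed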
def __init__(self, buf):
    import pyarrow

    self._buf = buf
    self._writer = pyarrow.FixedSizeBufferWriter(buf)
    self._writer.set_memcopy_threads(6)
    self._decompressor = None
def read_hdfs_bytes(vineyard_socket, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})

    header_row = False
    fragments = urlparse(path).fragment.split('&')
    path = urlparse(path).path
    for frag in fragments:
        try:
            k, v = frag.split('=')
        except:
            pass
        else:
            if k == 'header_row':
                header_row = (v.upper() == 'TRUE')
                if header_row:
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            elif k == 'delimiter':
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            elif k == 'include_all_columns':
                if v.upper() == 'TRUE':
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            else:
                builder[k] = v

    offset = 0
    chunk_size = 1024 * 1024 * 4

    header_line = hdfs.read_block(path, 0, 1, b'\n')
    builder['header_line'] = header_line.decode('unicode_escape')
    if header_row:
        offset = len(header_line)

    stream = builder.seal(client)
    client.persist(stream)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret), flush=True)

    writer = stream.open_writer(client)
    total_size = hdfs.info(path)['size']
    begin = (total_size - offset) // proc_num * proc_index + offset
    end = (total_size - offset) // proc_num + begin
    if proc_index + 1 == proc_num:
        end = total_size
    if proc_index:
        begin = next_delimiter(hdfs, path, begin, end, b'\n')
    else:
        begin -= int(header_row)

    offset = begin
    while offset < end:
        buf = hdfs.read_block(path, offset, min(chunk_size, end - offset), b'\n')
        size = len(buf)
        if not size:
            break
        offset += size - 1
        chunk = writer.next(size)
        buf_writer = pa.FixedSizeBufferWriter(chunk)
        buf_writer.write(buf)
        buf_writer.close()
    writer.finish()