def f(index, iterator):
    """Store the records of one partition in the plasma object store.

    ``random``, ``seed`` and ``object_store_address`` are not defined in this
    function; they are read from the surrounding scope.

    Args:
        index: Partition index. Its string form is appended to ``seed`` to
            reseed the ``random`` module for this call.
        iterator: Iterable over the partition's records. It is fully
            materialised into a list before being stored.

    Yields:
        A single ``(object_id, node_ip)`` tuple, where ``object_id`` is the
        value returned by ``client.put`` and ``node_ip`` is the value
        returned by ``get_node_ip()``.
    """
    import pyarrow.plasma as plasma
    from zoo.orca.data.utils import get_node_ip

    # mapPartition would set the same random seed for each partition?
    # Here use the partition index to override the random seed so that there
    # won't be identical object_ids in plasma.
    random.seed(seed + str(index))

    res = list(iterator)
    client = plasma.connect(object_store_address)
    try:
        object_id = client.put(res)
    finally:
        # Release the store connection whether or not the put succeeded, so
        # repeated calls do not accumulate open clients.
        client.disconnect()

    yield object_id, get_node_ip()
def f(index, iterator):
    """Store the records of one partition under a pre-assigned ObjectID.

    ``object_store_address`` and ``ids`` are not defined in this function;
    they are read from the surrounding scope.

    Args:
        index: Partition index, used to select this partition's target
            ObjectID as ``ids[index]``.
        iterator: Iterable over the partition's records. It is fully
            materialised into a list before being stored.

    Yields:
        A single ``(target_id, node_ip)`` tuple, where ``node_ip`` is the
        value returned by ``get_node_ip()``.

    Raises:
        AssertionError: If ``client.put`` returns an ObjectID different from
            the requested one.
    """
    import pyarrow.plasma as plasma
    from zoo.orca.data.utils import get_node_ip

    res = list(iterator)
    target_id = ids[index]
    client = plasma.connect(object_store_address)
    try:
        # If the ObjectID exists in plasma, we assume a task trial
        # succeeds and the data is already in the object store.
        if not client.contains(target_id):
            object_id = client.put(res, target_id)
            assert object_id == target_id, (
                "Errors occurred when putting data into plasma object store"
            )
    finally:
        # Disconnect on every path (including a failed put or assertion) so
        # the connection to the object store is never leaked.
        client.disconnect()

    yield target_id, get_node_ip()