# Imports assumed by these snippets (the *_pb2 modules are the project's
# generated protobuf bindings; their exact package path follows the
# surrounding source). The snippets also rely on names defined elsewhere in
# the source: LOGGER, _DTable, _EggRoll, Processor, generator, DELIMETER,
# DELIMETER_ENCODED, and the PROCESS_*_FORMAT log templates.
import hashlib
import uuid
from typing import Iterable

import cloudpickle
import numpy as np


def parallelize(self, data: Iterable, include_key=False, name=None, partition=1,
                namespace=None, create_if_missing=True, error_if_exist=False,
                persistent=False, chunk_size=100000, in_place_computing=False):
    if namespace is None:
        namespace = _EggRoll.get_instance().job_id
    if name is None:
        name = str(uuid.uuid1())
    # Persistent tables are LMDB-backed; transient ones live in memory.
    _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY
    storage_locator = storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)
    create_table_info = kv_pb2.CreateTableInfo(storageLocator=storage_locator,
                                               fragmentCount=partition)
    _table = self._create_table(create_table_info)
    _table.set_in_place_computing(in_place_computing)
    # Without explicit keys, enumerate() supplies sequential integer keys.
    _iter = data if include_key else enumerate(data)
    _table.put_all(_iter, chunk_size=chunk_size)
    LOGGER.debug("created table: %s", _table)
    return _table
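# Usage sketch (illustrative, not part of the source): creating tables from
# local data, assuming a running deployment and an initialized _EggRoll
# singleton. Names and data below are assumptions for the example.
eggroll = _EggRoll.get_instance()
plain = eggroll.parallelize(range(10), partition=2)                  # keys 0..9 via enumerate()
keyed = eggroll.parallelize([('a', 1), ('b', 2)], include_key=True)  # (k, v) pairs kept as-is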
def __get_locator(self, obj, name=None):
    # _DTable handles carry their own locator fields; anything else is
    # addressed as a persistent LMDB table in the current job's namespace.
    if isinstance(obj, _DTable):
        return storage_basic_pb2.StorageLocator(type=obj._type, namespace=obj._namespace,
                                                name=obj._name, fragment=obj._partitions)
    else:
        return storage_basic_pb2.StorageLocator(type=storage_basic_pb2.LMDB,
                                                namespace=self.job_id, name=name)
def join(self, _left: _DTable, _right: _DTable, func):
    func_id, func_bytes = self.serialize_and_hash_func(func)
    l_op = storage_basic_pb2.StorageLocator(namespace=_left._namespace, type=_left._type,
                                            name=_left._name)
    r_op = storage_basic_pb2.StorageLocator(namespace=_right._namespace, type=_right._type,
                                            name=_right._name)
    binary_p = processor_pb2.BinaryProcess(left=l_op, right=r_op,
                                           info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                       function_id=func_id,
                                                                       function_bytes=func_bytes))
    resp = self.proc_stub.join(binary_p)
    return self._create_table_from_locator(resp, _left._partitions)
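# Usage sketch (illustrative; `eggroll` as in the sketch above, and
# `left_table`/`right_table` assumed to be existing _DTable handles):
summed = eggroll.join(left_table, right_table, lambda v1, v2: v1 + v2)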
def sample(self, request, context):
    task_info = request.info
    LOGGER.debug(PROCESS_RECV_FORMAT.format('sample', task_info))
    op = request.operand
    # fraction and seed were cloudpickled on the client side.
    fraction, seed = cloudpickle.loads(task_info.function_bytes)
    source_db_path = Processor.get_path(op)
    rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                           name=task_info.function_id,
                                           fragment=op.fragment,
                                           type=storage_basic_pb2.IN_MEMORY)
    with Processor.get_environment(Processor.get_path(rtn), create_if_missing=True) as dest_env, \
            Processor.get_environment(source_db_path) as source_env:
        with source_env.begin() as source_txn:
            with dest_env.begin(write=True) as dest_txn:
                cursor = source_txn.cursor()
                cursor.first()
                # Bernoulli sampling: each record is kept independently with
                # probability `fraction`, reproducible for a fixed seed. Keys
                # and values are copied verbatim, without deserialization.
                random_state = np.random.RandomState(seed)
                for k, v in cursor:
                    if random_state.rand() < fraction:
                        dest_txn.put(k, v)
    LOGGER.debug(PROCESS_DONE_FORMAT.format('sample', rtn))
    return rtn
def join(self, request, context):
    task_info = request.info
    LOGGER.debug(PROCESS_RECV_FORMAT.format('join', task_info))
    _joiner, _serdes = self.get_function_and_serdes(task_info)
    left_op = request.left
    right_op = request.right
    rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                           name=task_info.function_id,
                                           fragment=left_op.fragment,
                                           type=storage_basic_pb2.IN_MEMORY)
    with Processor.get_environment(Processor.get_path(left_op)) as left_env, \
            Processor.get_environment(Processor.get_path(right_op)) as right_env, \
            Processor.get_environment(Processor.get_path(rtn), create_if_missing=True) as dst_env:
        with left_env.begin() as left_txn, right_env.begin() as right_txn, \
                dst_env.begin(write=True) as dst_txn:
            cursor = left_txn.cursor()
            # Inner join: iterate the left table, keep only keys that also
            # exist on the right, and combine values with the user joiner.
            for k_bytes, v1_bytes in cursor:
                v2_bytes = right_txn.get(k_bytes)
                if v2_bytes is None:
                    continue
                v1 = _serdes.deserialize(v1_bytes)
                v2 = _serdes.deserialize(v2_bytes)
                v3 = _joiner(v1, v2)
                dst_txn.put(k_bytes, _serdes.serialize(v3))
            cursor.close()
    LOGGER.debug(PROCESS_DONE_FORMAT.format('join', rtn))
    return rtn
def table(self, name, namespace, partition=1, create_if_missing=True,
          error_if_exist=False, persistent=True):
    _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY
    storage_locator = storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)
    create_table_info = kv_pb2.CreateTableInfo(storageLocator=storage_locator,
                                               fragmentCount=partition)
    _table = self._create_table(create_table_info)
    LOGGER.debug("created table: %s", _table)
    return _table
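# Usage sketch (illustrative): attaching to a persistent, LMDB-backed table
# by name and namespace; the identifiers are assumptions for the example.
model_table = eggroll.table(name='model', namespace='example_ns', partition=4)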
def __create_output_storage_locator(self, src_op, task_info, process_conf,
                                    is_in_place_computing_effective):
    # In-place computing writes back to the source locator instead of
    # creating a new output table.
    if is_in_place_computing_effective:
        if self.__get_in_place_computing_from_task_info(task_info):
            return src_op

    naming_policy = process_conf.namingPolicy
    LOGGER.info('naming policy in processor: {}'.format(naming_policy))
    if naming_policy == 'ITER_AWARE':
        # Derive a deterministic name from the source locator plus the
        # serialized function, so repeated iterations over the same input
        # with the same function map to the same output table.
        storage_name = DELIMETER.join([src_op.namespace, src_op.name,
                                       storage_basic_pb2.StorageType.Name(src_op.type)])
        name_ba = bytearray(storage_name.encode())
        name_ba.extend(DELIMETER_ENCODED)
        name_ba.extend(task_info.function_bytes)
        name = hashlib.md5(name_ba).hexdigest()
    else:
        name = task_info.function_id

    return storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                            name=name,
                                            fragment=src_op.fragment,
                                            type=storage_basic_pb2.IN_MEMORY)
def glom(self, _table: _DTable):
    func_id = str(uuid.uuid1())
    operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type, name=_table._name)
    unary_p = processor_pb2.UnaryProcess(operand=operand,
                                         info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                     function_id=func_id))
    resp = self.proc_stub.glom(unary_p)
    return self._create_table_from_locator(resp, _table._partitions)
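# Usage sketch (illustrative; `plain` as in the earlier sketch): each
# partition collapses into a single list of (k, v) pairs, stored under the
# partition's last key (see the processor-side glom below).
grouped = eggroll.glom(plain)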
def map_partitions(self, _table: _DTable, func):
    func_id, func_bytes = self.serialize_and_hash_func(func)
    operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type, name=_table._name)
    unary_p = processor_pb2.UnaryProcess(operand=operand,
                                         info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                     function_id=func_id,
                                                                     function_bytes=func_bytes))
    resp = self.proc_stub.mapPartitions(unary_p)
    return self._create_table_from_locator(resp, _table._partitions)
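# Usage sketch (illustrative): the function receives a whole partition as an
# iterator and returns a single value per partition (see the processor-side
# mapPartitions below).
counts = eggroll.map_partitions(plain, lambda kv_iter: sum(1 for _ in kv_iter))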
def sample(self, _table: _DTable, fraction, seed):
    if fraction < 0 or fraction > 1:
        raise ValueError("fraction must be in [0, 1]")
    # The (fraction, seed) pair travels to the processor as the "function"
    # payload of the task.
    func_bytes = self.value_serdes.serialize((fraction, seed))
    func_id = str(uuid.uuid1())
    operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type, name=_table._name)
    unary_p = processor_pb2.UnaryProcess(operand=operand,
                                         info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                     function_id=func_id,
                                                                     function_bytes=func_bytes))
    resp = self.proc_stub.sample(unary_p)
    return self._create_table_from_locator(resp, _table._partitions)
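# Usage sketch (illustrative): keep roughly 10% of records, reproducibly
# for a fixed seed.
subset = eggroll.sample(plain, fraction=0.1, seed=42)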
def cleanup(self, name, namespace, persistent):
    if namespace is None or name is None:
        raise ValueError("neither name nor namespace can be None")
    _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY
    storage_locator = storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)
    _table = _DTable(storage_locator=storage_locator)
    self.destroy_all(_table)
    LOGGER.debug("cleaned up: %s", _table)
def reduce(self, _table: _DTable, func):
    func_id, func_bytes = self.serialize_and_hash_func(func)
    operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type, name=_table._name)
    unary_p = processor_pb2.UnaryProcess(operand=operand,
                                         info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                     function_id=func_id,
                                                                     function_bytes=func_bytes))
    # Each partition streams back one partial result; drop empty partitions,
    # then fold the remaining partials with func on the client side.
    values = [_EggRoll._deserialize_operand(op) for op in self.proc_stub.reduce(unary_p)]
    values = [v for v in values if v is not None]
    if not values:
        return None
    val, *remain = values
    for _nv in remain:
        val = func(val, _nv)
    return val
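# Usage sketch (illustrative; `keyed` as in the earlier sketch): sum all
# values across partitions; per-partition partials are folded client-side
# as shown above.
total = eggroll.reduce(keyed, lambda a, b: a + b)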
def map(self, request, context):
    task_info = request.info
    LOGGER.debug(PROCESS_RECV_FORMAT.format('map', task_info))
    _mapper, _serdes = self.get_function_and_serdes(task_info)
    op = request.operand
    rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                           name=task_info.function_id,
                                           fragment=op.fragment,
                                           type=storage_basic_pb2.IN_MEMORY)
    src_db_path = Processor.get_path(op)
    dst_db_path = Processor.get_path(rtn)
    with Processor.get_environment(dst_db_path, create_if_missing=True) as dst_env, \
            Processor.get_environment(src_db_path) as src_env:
        with src_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
            cursor = src_txn.cursor()
            # The mapper may change both key and value, so both sides are
            # deserialized, mapped, and re-serialized.
            for k_bytes, v_bytes in cursor:
                k, v = _serdes.deserialize(k_bytes), _serdes.deserialize(v_bytes)
                k1, v1 = _mapper(k, v)
                dst_txn.put(_serdes.serialize(k1), _serdes.serialize(v1))
            cursor.close()
    LOGGER.debug(PROCESS_DONE_FORMAT.format('map', rtn))
    return rtn
def mapPartitions(self, request, context):
    task_info = request.info
    LOGGER.debug(PROCESS_RECV_FORMAT.format('mapPartitions', task_info))
    _mapper, _serdes = self.get_function_and_serdes(task_info)
    op = request.operand
    rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                           name=task_info.function_id,
                                           fragment=op.fragment,
                                           type=storage_basic_pb2.IN_MEMORY)
    src_db_path = Processor.get_path(op)
    dst_db_path = Processor.get_path(rtn)
    with Processor.get_environment(dst_db_path, create_if_missing=True) as dst_env, \
            Processor.get_environment(src_db_path) as src_env:
        with src_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
            cursor = src_txn.cursor()
            # The mapper consumes the whole partition as a generator and
            # returns a single value, stored under the partition's last key.
            v = _mapper(generator(_serdes, cursor))
            if cursor.last():
                k_bytes = cursor.key()
                dst_txn.put(k_bytes, _serdes.serialize(v))
            cursor.close()
    LOGGER.debug(PROCESS_DONE_FORMAT.format('mapPartitions', rtn))
    return rtn
def glom(self, request, context):
    task_info = request.info
    LOGGER.debug(PROCESS_RECV_FORMAT.format('glom', task_info))
    op = request.operand
    _serdes = self._serdes
    src_db_path = Processor.get_path(op)
    rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id,
                                           name=task_info.function_id,
                                           fragment=op.fragment,
                                           type=storage_basic_pb2.IN_MEMORY)
    with Processor.get_environment(src_db_path) as src_env, \
            Processor.get_environment(Processor.get_path(rtn), create_if_missing=True) as dst_env:
        with src_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
            cursor = src_txn.cursor()
            # Collapse the whole partition into one list of (k, v) pairs,
            # stored under the partition's last key.
            v_list = []
            k_bytes = None
            for k, v in cursor:
                v_list.append((_serdes.deserialize(k), _serdes.deserialize(v)))
                k_bytes = k
            if k_bytes is not None:
                dst_txn.put(k_bytes, _serdes.serialize(v_list))
    LOGGER.debug(PROCESS_DONE_FORMAT.format('glom', rtn))
    return rtn
def __create_storage_locator(self, namespace, name, type):
    return storage_basic_pb2.StorageLocator(namespace=namespace, name=name, type=type)