Exemplo n.º 1
0
    def mapPartitions(self, _table, func):
        """Apply ``func`` to every partition of ``_table`` and wrap the output.

        One ``mapPartitions`` request is submitted per partition as a future;
        the stub for a partition is picked by the partition index modulo the
        number of entries in ``self.proc_list``. All futures are awaited before
        returning.

        Args:
            _table: table whose ``partition`` attribute gives the number of
                partitions to process.
            func: callable that is serialized and shipped with each request.

        Returns:
            A ``_DTable`` built from the type, namespace and name reported by
            the last awaited response, with the same partition count as
            ``_table``.

        Note:
            When ``_table.partition`` is 0 no response exists, so building the
            return value fails on the unbound response variable.
        """
        func_id, func_bytes = self.serialize_and_hash_func(func)
        pending = []

        for part_idx in range(_table.partition):
            locator = EggRoll.__get_storage_locator(_table, part_idx)
            task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                               function_id=func_id,
                                               function_bytes=func_bytes)
            request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

            # Spread partitions over the available processors.
            _channel, stub = self.proc_list[part_idx % len(self.proc_list)]
            pending.append(stub.mapPartitions.future(request))

        # Block on every future; only the last response is used afterwards.
        for future in pending:
            response = future.result()
        return _DTable(self, response.type, response.namespace, response.name, _table.partition)
Exemplo n.º 2
0
    def join(self, left, right, func):
        """Join ``left`` and ``right`` partition by partition using ``func``.

        For each partition index of ``left`` a ``BinaryProcess`` request is
        built from the matching storage locators of both tables and submitted
        as a future to the stub picked by the partition index modulo the number
        of entries in ``self.proc_list``. All futures are awaited before
        returning.

        Args:
            left: left-hand table; its ``partition`` count drives the loop.
            right: right-hand table, addressed with the same partition indices.
            func: callable that is serialized and shipped with each request.

        Returns:
            A ``_DTable`` built from the type, namespace and name reported by
            the last awaited response, with ``left.partition`` partitions.

        Note:
            When ``left.partition`` is 0 there is no response and the
            attribute access on the ``None`` placeholder fails.
        """
        func_id, func_bytes = self.serialize_and_hash_func(func)

        last_response = None
        pending = []
        for part_idx in range(left.partition):
            left_locator = EggRoll.__get_storage_locator(left, part_idx)
            right_locator = EggRoll.__get_storage_locator(right, part_idx)
            task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                               function_id=func_id,
                                               function_bytes=func_bytes)
            request = processor_pb2.BinaryProcess(left=left_locator,
                                                  right=right_locator,
                                                  info=task_info)
            # Spread partitions over the available processors.
            _channel, stub = self.proc_list[part_idx % len(self.proc_list)]
            pending.append(stub.join.future(request))

        # Block on every future; only the last response is used afterwards.
        for future in pending:
            last_response = future.result()
        return _DTable(self, last_response.type, last_response.namespace, last_response.name,
                       left.partition)
Exemplo n.º 3
0
 def reduce(self, _table: _DTable, func):
     """Reduce the contents of ``_table`` to a single value with ``func``.

     A single ``UnaryProcess`` request addressing the table by namespace,
     type and name is sent through ``self.proc_stub.reduce``. Every item
     yielded by that call is deserialized, ``None`` results are discarded,
     and the remaining values are folded left to right with ``func``.

     Args:
         _table: table to reduce.
         func: binary callable; it is serialized for the request and also
             applied locally to combine the returned values.

     Returns:
         ``None`` when no non-``None`` value came back, the single value
         when exactly one came back, otherwise the left fold of all values.
     """
     func_id, func_bytes = self.serialize_and_hash_func(func)
     locator = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                                type=_table._type,
                                                name=_table._name)
     task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id,
                                        function_bytes=func_bytes)
     request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

     # Decode each reply and keep only the ones that carry a value.
     partials = []
     for reply in self.proc_stub.reduce(request):
         decoded = _EggRoll._deserialize_operand(reply)
         if decoded is not None:
             partials.append(decoded)

     if not partials:
         return None

     # With a single partial the loop body never runs and it is returned as is.
     accumulated = partials[0]
     for following in partials[1:]:
         accumulated = func(accumulated, following)
     return accumulated
Exemplo n.º 4
0
 def sample(self, _table, fraction, seed):
     """Submit a ``sample`` request for every partition of ``_table``.

     The ``(fraction, seed)`` pair is serialized with ``self._serdes`` and
     sent in the function-bytes field of each request, tagged with a fresh
     ``uuid1``-based function id. Requests are submitted as futures to the
     stub picked by the partition index modulo the number of entries in
     ``self.proc_list`` and are all awaited before returning.

     Args:
         _table: table whose ``partition`` attribute gives the number of
             partitions to sample.
         fraction: value that must not be below 0 or above 1.
         seed: value serialized together with ``fraction``.

     Returns:
         A ``_DTable`` built from the type, namespace and name reported by
         the last awaited response, with the same partition count as
         ``_table``.

     Raises:
         ValueError: if ``fraction`` is less than 0 or greater than 1.

     Note:
         When ``_table.partition`` is 0 no response exists, so building the
         return value fails on the unbound response variable.
     """
     if fraction < 0 or fraction > 1:
         raise ValueError("fraction must be in [0, 1]")

     payload = self._serdes.serialize((fraction, seed))
     request_id = str(uuid.uuid1())
     pending = []
     for part_idx in range(_table.partition):
         locator = EggRoll.__get_storage_locator(_table, part_idx)
         task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                            function_id=request_id,
                                            function_bytes=payload)
         request = processor_pb2.UnaryProcess(operand=locator, info=task_info)
         # Spread partitions over the available processors.
         _channel, stub = self.proc_list[part_idx % len(self.proc_list)]
         pending.append(stub.sample.future(request))

     # Block on every future; only the last response is used afterwards.
     for future in pending:
         response = future.result()
     return _DTable(self, response.type, response.namespace, response.name,
                    _table.partition)
Exemplo n.º 5
0
 def reduce(self, _table, func):
     """Reduce all partitions of ``_table`` to a single value with ``func``.

     For each partition a ``UnaryProcess`` request is sent through the
     ``reduce`` call of the stub picked by the partition index modulo the
     number of entries in ``self.proc_list``. Responses with a non-empty
     ``value`` are deserialized with ``self._serdes``, ``None`` results are
     discarded, and the remaining partial values are folded left to right
     with ``func``.

     Args:
         _table: table whose ``partition`` attribute gives the number of
             partitions to reduce.
         func: binary callable; it is serialized for the requests and also
             applied locally to combine the partial values.

     Returns:
         The folded value, or ``None`` when no response carried a usable
         (non-empty, non-``None``) partial value.
     """
     func_id, func_bytes = self.serialize_and_hash_func(func)
     responses = []
     for partition in range(_table.partition):
         operand = EggRoll.__get_storage_locator(_table, partition)
         proc_id = partition % len(self.proc_list)
         _channel, stub = self.proc_list[proc_id]
         unary_p = processor_pb2.UnaryProcess(operand=operand,
                                              info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                          function_id=func_id,
                                                                          function_bytes=func_bytes))
         # extend() avoids re-copying the accumulated list on every partition.
         responses.extend(stub.reduce(unary_p))

     # Skip responses without a payload, then drop payloads decoding to None.
     partials = [self._serdes.deserialize(resp.value)
                 for resp in responses if len(resp.value) > 0]
     partials = [value for value in partials if value is not None]

     # Guard on the filtered list: responses may exist while every one of
     # them is empty, in which case there is nothing to fold.
     if not partials:
         return None

     rtn = partials[0]
     for value in partials[1:]:
         rtn = func(rtn, value)
     return rtn