示例#1
0
 def reduce(self, _table, func):
     """Reduce the contents of ``_table`` to a single value with ``func``.

     One ``UnaryProcess`` request is sent per partition to the processor
     stub selected by ``partition % len(self.proc_list)``. Every response
     with a non-empty ``value`` is deserialized with ``self._serdes``;
     deserialized values that are ``None`` are discarded. The remaining
     partial results are folded left-to-right with ``func``.

     :param _table: table-like object exposing a ``partition`` count.
     :param func: binary callable used to combine two partial results.
     :return: the combined value, or ``None`` when no partition produced
         a usable partial result.
     """
     func_id, func_bytes = self.serialize_and_hash_func(func)
     responses = []
     for partition in range(_table.partition):
         operand = EggRoll.__get_storage_locator(_table, partition)
         proc_id = partition % len(self.proc_list)
         _channel, stub = self.proc_list[proc_id]
         unary_p = processor_pb2.UnaryProcess(
             operand=operand,
             info=processor_pb2.TaskInfo(task_id=self.job_id,
                                         function_id=func_id,
                                         function_bytes=func_bytes))
         # stub.reduce yields an iterable of responses; extend in place
         # instead of rebuilding the list on every partition.
         responses.extend(stub.reduce(unary_p))

     partials = []
     for response in responses:
         if len(response.value) > 0:
             item = self._serdes.deserialize(response.value)
             if item is not None:
                 partials.append(item)

     # Guard on the filtered partials, not on the raw responses: responses
     # with empty payloads would otherwise leave nothing to index below.
     if not partials:
         return None

     rtn = partials[0]
     for item in partials[1:]:
         rtn = func(rtn, item)
     return rtn
示例#2
0
    def __create_task_info(self, func, is_in_place_computing):
        """Build the ``TaskInfo`` message describing one task.

        When ``func`` is truthy its id and serialized bytes come from
        ``self.serialize_and_hash_func``. Otherwise a fresh UUID1 string is
        used as the function id and the placeholder payload ``b'blank'`` is
        sent as the function bytes.

        :param func: callable to ship with the task, or a falsy value.
        :param is_in_place_computing: forwarded as ``isInPlaceComputing``.
        :return: a ``processor_pb2.TaskInfo`` whose task id is
            ``self.session_id``.
        """
        if not func:
            # No function supplied: use a unique id and a placeholder body.
            identity = (str(uuid.uuid1()), b'blank')
        else:
            identity = self.serialize_and_hash_func(func)
        func_id, func_bytes = identity

        return processor_pb2.TaskInfo(
            task_id=self.session_id,
            function_id=func_id,
            function_bytes=func_bytes,
            isInPlaceComputing=is_in_place_computing)
示例#3
0
 def glom(self, _table):
     """Submit a ``glom`` request for every partition of ``_table``.

     Each partition gets its own ``UnaryProcess`` request, dispatched
     asynchronously to the stub selected by
     ``partition % len(self.proc_list)``. All requests share one freshly
     generated UUID1 function id and carry no function bytes.

     :param _table: table-like object exposing a ``partition`` count.
     :return: a ``_DTable`` built from the ``type``, ``namespace`` and
         ``name`` of the last completed response, with the same partition
         count as ``_table``.
     """
     shared_func_id = str(uuid.uuid1())
     pending = []
     for index in range(_table.partition):
         locator = EggRoll.__get_storage_locator(_table, index)
         task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                            function_id=shared_func_id)
         request = processor_pb2.UnaryProcess(operand=locator,
                                              info=task_info)
         _channel, stub = self.proc_list[index % len(self.proc_list)]
         pending.append(stub.glom.future(request))
     # Wait for every future; only the last response is used afterwards.
     for future in pending:
         result = future.result()
     return _DTable(self, result.type, result.namespace, result.name,
                    _table.partition)
示例#4
0
    def mapValues(self, _table, func):
        """Submit a ``mapValues`` request with ``func`` for every partition.

        ``func`` is serialized once through ``self.serialize_and_hash_func``
        and the resulting id and bytes are attached to each partition's
        ``UnaryProcess`` request. Requests are dispatched asynchronously to
        the stub selected by ``partition % len(self.proc_list)``.

        :param _table: table-like object exposing a ``partition`` count.
        :param func: callable shipped to the processors.
        :return: a ``_DTable`` built from the ``type``, ``namespace`` and
            ``name`` of the last completed response, with the same
            partition count as ``_table``.
        """
        func_id, func_bytes = self.serialize_and_hash_func(func)
        pending = []
        for index in range(_table.partition):
            locator = EggRoll.__get_storage_locator(_table, index)
            task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                               function_id=func_id,
                                               function_bytes=func_bytes)
            request = processor_pb2.UnaryProcess(operand=locator,
                                                 info=task_info)
            _channel, stub = self.proc_list[index % len(self.proc_list)]
            pending.append(stub.mapValues.future(request))

        # Wait for every future; only the last response is used afterwards.
        for future in pending:
            result = future.result()
        return _DTable(self, result.type, result.namespace, result.name,
                       _table.partition)
示例#5
0
    def join(self, left, right, func):
        """Submit a ``join`` request for every partition of ``left``.

        For each partition index of ``left`` a ``BinaryProcess`` request is
        built from the storage locators of ``left`` and ``right`` at that
        same index, together with the serialized ``func``. Requests are
        dispatched asynchronously to the stub selected by
        ``partition % len(self.proc_list)``.

        :param left: table-like object whose ``partition`` count drives
            the loop and the partition count of the returned table.
        :param right: table-like object supplying the right-hand operand.
        :param func: callable shipped to the processors.
        :return: a ``_DTable`` built from the ``type``, ``namespace`` and
            ``name`` of the last completed response.
        """
        func_id, func_bytes = self.serialize_and_hash_func(func)

        pending = []
        for index in range(left.partition):
            left_locator = EggRoll.__get_storage_locator(left, index)
            right_locator = EggRoll.__get_storage_locator(right, index)
            task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                               function_id=func_id,
                                               function_bytes=func_bytes)
            request = processor_pb2.BinaryProcess(left=left_locator,
                                                  right=right_locator,
                                                  info=task_info)
            _channel, stub = self.proc_list[index % len(self.proc_list)]
            pending.append(stub.join.future(request))

        # Wait for every future; only the last response is used afterwards.
        outcome = None
        for future in pending:
            outcome = future.result()
        return _DTable(self, outcome.type, outcome.namespace, outcome.name,
                       left.partition)
示例#6
0
 def sample(self, _table, fraction, seed):
     """Submit a ``sample`` request for every partition of ``_table``.

     The ``(fraction, seed)`` pair is serialized with ``self._serdes`` and
     sent in the ``function_bytes`` field of each request. All requests
     share one freshly generated UUID1 function id and are dispatched
     asynchronously to the stub selected by
     ``partition % len(self.proc_list)``.

     :param _table: table-like object exposing a ``partition`` count.
     :param fraction: sampling fraction; must lie in ``[0, 1]``.
     :param seed: seed value serialized alongside ``fraction``.
     :return: a ``_DTable`` built from the ``type``, ``namespace`` and
         ``name`` of the last completed response, with the same partition
         count as ``_table``.
     :raises ValueError: if ``fraction`` is below 0 or above 1.
     """
     out_of_range = fraction < 0 or fraction > 1
     if out_of_range:
         raise ValueError("fraction must be in [0, 1]")

     payload = self._serdes.serialize((fraction, seed))
     pending = []
     shared_func_id = str(uuid.uuid1())
     for index in range(_table.partition):
         locator = EggRoll.__get_storage_locator(_table, index)
         task_info = processor_pb2.TaskInfo(task_id=self.job_id,
                                            function_id=shared_func_id,
                                            function_bytes=payload)
         request = processor_pb2.UnaryProcess(operand=locator,
                                              info=task_info)
         _channel, stub = self.proc_list[index % len(self.proc_list)]
         pending.append(stub.sample.future(request))
     # Wait for every future; only the last response is used afterwards.
     for future in pending:
         result = future.result()
     return _DTable(self, result.type, result.namespace, result.name,
                    _table.partition)