Exemplo n.º 1
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal interface.

        Build the processing node on top of ``load_node`` and wrap it in a
        PType.

        The node is processed by ``_TextFromRecord()``, typed with
        ``serde.StrSerde()``, given an effective key number of 0 and allowed
        partial processing on input 0.  When ``self.repeatedly`` is truthy the
        chain starts from ``load_node.repeatedly()`` instead of ``load_node``.
        The resulting node's size is set to ``load_node.size()``.

        Args:
          load_node:  the node produced by loading the input
          pipeline:  the pipeline the returned PType is bound to

        Returns:
          A PTable (key serde ``StrSerde``) wrapping a PCollection of the node
          when the ``'partitioned'`` option is truthy; otherwise a PCollection
          built from the node after ``leave_scope()``.
        """
        from bigflow import ptable

        # Both modes share the same builder chain; only the starting node differs.
        source = load_node.repeatedly() if self.repeatedly else load_node
        text_node = source \
            .process_by(_TextFromRecord()) \
            .as_type(serde.StrSerde()) \
            .set_effective_key_num(0) \
            .input(0).allow_partial_processing() \
            .done()

        text_node.set_size(load_node.size())

        if not self._options.get('partitioned', False):
            return pcollection.PCollection(text_node.leave_scope(), pipeline)

        partition_values = pcollection.PCollection(text_node, pipeline)
        return ptable.PTable(partition_values, key_serde=serde.StrSerde())
Exemplo n.º 2
0
 def transform_from_node(self, load_node, pipeline):
     """
     Inner function.

     Wrap ``load_node`` in a PCollection, wrap that in a PTable whose key
     serde is ``serde.CPickleSerde()``, and return whatever
     ``self._user_input_base.post_process`` produces for that PTable.
     """
     from bigflow import ptable
     loaded = pcollection.PCollection(load_node, pipeline)
     raw_table = ptable.PTable(loaded, key_serde=serde.CPickleSerde())
     return self._user_input_base.post_process(raw_table)
Exemplo n.º 3
0
def construct(pipeline,
              node,
              type,
              nested_level=None,
              inner_most_type=None,
              key_serdes=None):
    """
    Construct a PType from a LogicalPlan node

    Args:
      pipeline (Pipeline):  the Pipeline the constructed PType belongs to
      node (LogicalPlan.Node):  node
      type (class):  class of PType to construct

    Kwargs:
      nested_level:  number of extra PTable layers wrapped around the
          innermost PTable if PType is a PTable; None is treated as 0
      inner_most_type:  specify PTable's inner-most type if PType is a PTable
      key_serdes:  key serdes for the wrapping PTable layers, outermost first;
          when None, ``pipeline.default_objector()`` is used for every layer.
          The innermost PTable is built without an explicit key_serde.

    Returns:
      PType:  PType

    Raises:
      ValueError:  if inner_most_type is PTable
    """
    if inner_most_type is ptable.PTable:
        raise ValueError("Invalid value type for PTable")

    if type is pobject.PObject:
        return pobject.PObject(node, pipeline)
    if type is pcollection.PCollection:
        return pcollection.PCollection(node, pipeline)

    # The signature defaults nested_level to None, which cannot be used in the
    # arithmetic/comparison below; a missing level means a single flat PTable.
    if nested_level is None:
        nested_level = 0

    if key_serdes is None:
        key_serdes = [pipeline.default_objector()] * (nested_level + 1)

    if nested_level > 0:
        # Peel off one layer: the first serde keys this layer, the rest are
        # handed to the recursive call for the layers underneath.
        inner = construct(pipeline, node, type,
                          nested_level - 1, inner_most_type,
                          key_serdes[1:])
        return ptable.PTable(inner, key_serde=key_serdes[0])

    return ptable.PTable(inner_most_type(node, pipeline))
Exemplo n.º 4
0
def group_by_every_record(pvalue, **options):
    """
    Group by every record.

    Shuffles ``pvalue``'s node inside its own scope, applies
    ``distribute_every()`` to the first shuffled node, and returns the result
    as a PTable keyed with ``serde.StrSerde()``.

    Args:
      pvalue:  the input whose node is shuffled
      **options:  accepted but not read by this function

    Returns:
      PTable:  a PTable wrapping a PCollection of the shuffled node
    """
    owner = pvalue.pipeline()
    source = pvalue.node()
    shuffle_group = source.plan().shuffle(source.scope(), [source])
    every_record = shuffle_group.node(0).distribute_every()

    from bigflow import serde
    str_serde = serde.StrSerde()
    grouped = pcollection.PCollection(every_record, owner)
    return ptable.PTable(grouped, key_serde=str_serde)
Exemplo n.º 5
0
def window_into(pvalue, win, **options):
    """
    Group by window.

    Args:
      pvalue:  the input to be windowed
      win:  the window; must provide ``key_serde()``
      **options:
        key_serde:  serde for the window key; defaults to ``win.key_serde()``,
            and falls back to the pipeline's default objector when the chosen
            value is falsy
        concurrency:  forwarded to ``node_window_by`` (None when absent)

    Returns:
      PTable:  a PTable wrapping a PCollection of the windowed node
    """
    pipeline = pvalue.pipeline()

    # win.key_serde() is evaluated even when the caller supplies 'key_serde'.
    key_serde = (options.get('key_serde', win.key_serde())
                 or pvalue.pipeline().default_objector())

    windowed = node_window_by(pvalue.node(), win,
                              options.get('concurrency'), pipeline)
    values = pcollection.PCollection(windowed, pipeline)
    return ptable.PTable(values, key_serde=key_serde)
Exemplo n.º 6
0
def cogroup(*pcollections, **kargs):
    """
    Inner function: co-group several PCollections into one PTable.

    All inputs are put through a single shuffle created on the plan and scope
    of the first input's node.  Each shuffled node is matched by
    ``KeyReader(None, key_serde)`` and then mapped with
    ``entity.ExtractValueFn()`` using the corresponding value serde.

    Args:
      *pcollections (PCollection):  the inputs to co-group; at least one

    Kwargs:
      key_serde:  serde of the grouping key; when missing or None it is
          derived with ``serde._key_serde`` from the first input's serde and
          the pipeline's default objector
      value_serdes:  one serde per input, in input order; when missing or
          None each is derived with ``serde._value_serde`` from that input's
          serde and the pipeline's default objector
      concurrency:  passed to the shuffle's ``with_concurrency``; when absent
          and ``pipeline.estimate_concurrency`` is truthy, the total size of
          the input nodes divided by ``pipeline.size_per_concurrency`` is used

    Returns:
      PTable:  a PTable (keyed with key_serde) wrapping a tuple holding one
      PCollection per input

    Raises:
      ValueError:  if no input is given, or any input is not a PCollection
    """
    if not pcollections:
        raise ValueError("No argument")

    # Validate before touching any input so that a wrong argument type is
    # reported as such instead of failing on a missing method first.
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")

    from bigflow import serde

    first = pcollections[0]
    pipeline = first.pipeline()

    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(first.serde(),
                                     pipeline.default_objector())

    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [
            serde._value_serde(p.serde(), pipeline.default_objector())
            for p in pcollections
        ]

    def _make_shuffle(node, value_serde):
        return pcollection.PCollection(node.match_by(KeyReader(None, key_serde)), pipeline)\
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = first.node().plan()
    scope = first.node().scope()

    # A real list (not a lazy map object): it is measured with len() and
    # iterated more than once below.
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)

    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        concurrency = sum(node.size()
                          for node in nodes) / pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)

    # Indexing value_serdes keeps the IndexError for a too-short serde list.
    results = tuple(_make_shuffle(shuffle.node(index), value_serdes[index])
                    for index in range(len(nodes)))

    return ptable.PTable(results, key_serde=key_serde)
Exemplo n.º 7
0
def group_by(pvalue, key_extractor, value_extractor, **options):
    """
    Group pvalue by key; only tuple-pair elements of pvalue are accepted.

    Args:
      pvalue:  the input to group
      key_extractor:  forwarded to ``node_group_by``
      value_extractor:  forwarded to ``node_group_by``; also selects the
          default value serde (see below)
      **options:
        key_serde:  defaults to the pipeline's default objector
        value_serde:  defaults to ``pvalue.serde()`` when value_extractor is
            None, otherwise to the pipeline's default objector
        concurrency:  forwarded to ``node_group_by`` (None when absent)

    Returns:
      PTable:  a PTable (keyed with key_serde) wrapping a PCollection of the
      grouped node
    """
    key_serde = options.get('key_serde', pvalue.pipeline().default_objector())

    # Without a value extractor the values keep the input's own serde.
    if value_extractor is None:
        fallback_value_serde = pvalue.serde()
    else:
        fallback_value_serde = pvalue.pipeline().default_objector()
    value_serde = options.get('value_serde', fallback_value_serde)

    pipeline = pvalue.pipeline()
    grouped = node_group_by(pvalue.node(), key_extractor, value_extractor,
                            key_serde, value_serde,
                            options.get('concurrency'), pipeline)

    values = pcollection.PCollection(grouped, pipeline)
    return ptable.PTable(values, key_serde=key_serde)
Exemplo n.º 8
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal interface.

        Build the processing node on top of ``load_node``, deserialize its
        records and wrap the result in a PType.

        The node is processed by ``_KVFromBinaryRecord()``, typed as a tuple
        of two ``StrSerde`` values, given an effective key number of 0 and
        allowed partial processing on input 0.  When ``self.repeatedly`` is
        truthy the chain starts from ``load_node.repeatedly()``; otherwise it
        starts from ``load_node`` and additionally calls ``ignore_group()``.
        The node's size is set to ``load_node.size()``.

        Records are then mapped with ``self.kv_deserializer`` when it is not
        None, or with an ``entity.SerdeWrapper`` built from the ``'serde'``
        option (default: the pipeline's default objector).

        Args:
          load_node:  the node produced by loading the input
          pipeline:  the pipeline the returned PType is bound to

        Returns:
          A PTable (key serde ``StrSerde``) wrapping the mapped PCollection
          when the ``'partitioned'`` option is truthy; otherwise a PCollection
          built from the mapped node after ``leave_scope()``.
        """
        from bigflow import ptable

        repeatedly = self.repeatedly
        source = load_node.repeatedly() if repeatedly else load_node
        builder = source \
            .process_by(_KVFromBinaryRecord()) \
            .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
            .set_effective_key_num(0)
        if not repeatedly:
            # Only the non-repeated chain ignores the group.
            builder = builder.ignore_group()
        kv_node = builder.input(0).allow_partial_processing().done()

        kv_node.set_size(load_node.size())

        records = pcollection.PCollection(kv_node, pipeline)

        record_serde = self._options.get('serde', pipeline.default_objector())

        if self.kv_deserializer is None:
            # False selects deserialization (the is_serialize flag); the
            # meaning of the trailing 1 is not visible here -- TODO confirm.
            mapper = entity.SerdeWrapper(record_serde, False, 1)
        else:
            mapper = self.kv_deserializer
        records = records.map(mapper, serde=record_serde)

        if not self._options.get('partitioned'):
            return pcollection.PCollection(records.node().leave_scope(),
                                           pipeline)
        return ptable.PTable(records, key_serde=serde.StrSerde())